scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26728B)
      1 {
      2   "paper": {
      3     "title": "Neuro-Symbolic Verification on Instruction Following of LLMs",
      4     "authors": ["Yiming Su", "Kunzhao Xu", "Yanjie Gao", "Fan Yang", "Cheng Li", "Mao Yang", "Tianyin Xu"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.17789",
      8     "doi": "10.48550/arXiv.2601.17789"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "The footnote states 'This paper is a result of an open-source project' but no repository URL or archive link is provided in the paper."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "VIFBENCH is described in detail but no download link or repository URL for the benchmark data is provided."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper only mentions models are queried through API interfaces."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions or README are provided. Prompts are included in Appendix B but no instructions for running the full pipeline."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Tables 2 and 3 report only point estimates (e.g., '94.8% F1 score') with no confidence intervals or error bars."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims NSVIF 'significantly outperforms' baselines but reports no statistical significance tests — comparisons are based solely on comparing point estimates."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Table 2 reports percentage improvements with baseline context (e.g., '25.7%↑' from 69.1% to 94.8% F1), providing enough to gauge effect magnitude."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "VIFBENCH contains 820 data points. No justification is given for why this size was chosen or whether it provides sufficient statistical power."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No variance, standard deviation, or spread measures across runs are reported. Results appear to be from single runs at temperature 0.2."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Multiple baselines are included: standard LLM-as-a-judge (§5), GEPA-COT, CONV-COT, and NSVIF-NEU ablation (Table 3)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines use contemporary models (GPT-4.1, DeepSeek-R1/V3.1, Qwen3-Max) and recent prompt optimization techniques (GEPA from DSPy, conversation-based CoT)."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 3 presents an explicit ablation study (§5.1.1) comparing NSVIF-NEU (without logic reasoning) and the full NSVIF, along with intermediate variants."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Four metrics are used: F1 score, Precision, Recall, and Pass@1 (§5, Table 2)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Evaluation is entirely automated against VIFBENCH labels. No human evaluation of NSVIF's verification outputs is performed. The semantic constraints in VIFBENCH were spot-checked by authors during data generation (§A.2) but this is dataset construction, not system evaluation."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "VIFBENCH is a new benchmark not used for tuning NSVIF. NSVIF's in-context examples (Appendix B) are hand-crafted examples unrelated to VIFBENCH. GEPA-COT was optimized with a small minibatch (size=3) but the full 820-point benchmark is the evaluation set."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Figure 4 breaks down F1 by number of constraints (2-10). Table 4 breaks down false positives vs false negatives. Table 2 breaks down by model."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "§5.3 provides detailed analysis of 23 false positives and 18 false negatives, categorizing root causes (hallucination, checker failures, misclassification). §5.2 discusses GPT-4o-mini failure modes."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "GPT-4o-mini makes NSVIF worse than the baseline (37.3% vs 66.6% F1, Table 2), which is explicitly discussed as an unexpected negative finding in §5.2."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims NSVIF 'significantly outperforms LLM-based approaches' — supported by Table 2 showing 7-25.7% F1 improvements across models. Claim about interpretable feedback is supported by the constraint-level explanations described in §3.2. Claim about improving instruction following without post-training is supported by §5.4."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The ablation study (Table 3) makes causal claims about component contributions through controlled single-variable manipulation: NSVIF vs NSVIF-NEU isolates the logic reasoning component. The design is adequate for these claims."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper claims NSVIF is 'universal, general-purpose' (abstract, §1, §3), but evaluation is limited to VIFBENCH, which covers only English writing tasks with 8 logic and 2 semantic constraint types. The limitations section (§8) acknowledges this, but the framing in the title and abstract still far exceeds the evaluation scope."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper does not consider alternative explanations for NSVIF's improvements beyond the intended neuro-symbolic mechanism. For instance, the additional compute budget (roughly 160x more input tokens than the baseline per Table 7) is not discussed as a potential confound."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper measures F1/precision/recall on VIFBENCH for instruction-following verification, and claims are about verification effectiveness. The measurements match the claims at the granularity presented."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "§5 specifies model versions with dates: GPT-4.1 (2025-04-14), GPT-4o (2024-11-20), DeepSeek-R1 (0528), Qwen3-Max (2025-09-23), GPT-4o-mini (2024-07-18)."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompts for all three NSVIF agents (formulation, checking, solver), NSVIF-NEU, baseline, GEPA-COT, CONV-COT, and the travel agent are provided in Appendix B."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Temperature 0.2 is stated for all models (§A.5). GEPA parameters are reported: max_full_evals=1, num_threads=1, reflection_minibatch_size=3. Self-reflection budget of 3 retries for checker code."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The multi-agent scaffold is described in detail in §3.2: formulation agent, checking agent, and solver agent, with workflow diagram (Figure 1), retry logic (3 self-reflections), and fallback mechanism."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "§4 and Appendix A document VIFBENCH construction: constraint selection from existing datasets, instruction synthesis via Cartesian products, output generation with sat/unsat balancing, mutation strategy for unsat examples, and re-generation budget of 5."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "§8 'Limitations' provides substantive discussion covering CSP formulation limits, LLM hallucination paradox, benchmark scope, and model coverage."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "§8 discusses specific threats: CSP cannot handle soft vs hard constraints, unsatisfiable instructions could cause infinite retries, VIFBENCH limited to English writing tasks, LLM hallucination during verification is irreducible, and evaluation limited to available models."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "§8 explicitly states what was not tested: multilingual instructions, tasks beyond writing (information retrieval, QA, planning), and LLMs beyond those evaluated. §2.1 also clarifies NSVIF checks behavior correctness, not linguistic quality."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "VIFBENCH data (820 instruction-output-result tuples) is not made available for download. No data archive or repository link is provided."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "§4 describes in detail how VIFBENCH was constructed: constraint types sourced from instruction-tuning datasets and benchmarks (Table 5), constraint values generated by GPT-4.1, instructions synthesized via Cartesian product permutation, outputs generated by GPT-4.1 with sat/unsat balancing."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. VIFBENCH is entirely synthetic."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "§4.1-4.2 and Appendix A document the full pipeline: constraint definition → Cartesian product combinations (with 100-cap) → instruction synthesis via templates → GPT-4.1 output generation → sat verification with re-prompting (budget=5) → mutation for unsat examples. Counts given: 820 total, 391 sat, 429 unsat."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding acknowledgments are provided. The paper mentions Kunzhao Xu's internship at Microsoft Research but no funding sources are disclosed."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: UIUC, USTC, and Microsoft Research. The MSR internship is noted in the footnote."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "Three authors are from Microsoft Research. The paper evaluates Azure AI Foundry models (GPT-4.1, GPT-4o, GPT-4o-mini) and the models are 'provided by Azure AI Foundry.' Microsoft has a financial interest in demonstrating the need for better LLM verification tools. This dependency is not acknowledged."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial disclosure statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff dates are stated for any of the evaluated models (GPT-4.1, GPT-4o, DeepSeek-R1/V3.1, Qwen3-Max)."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether VIFBENCH data or similar instruction-following examples could have appeared in model training data. While VIFBENCH is newly created, the constraint types and writing topics may overlap with training data."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "VIFBENCH is new and presumably not in training data, but this is not explicitly discussed or verified. The paper does not address contamination risk."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Table 7 reports input and output token usage for every model and verifier combination on VIFBENCH, showing e.g., NSVIF with GPT-4.1 uses 130M input + 6.7M output tokens."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Table 7 provides total token consumption for all experiments. Models are accessed via API (Azure AI Foundry, Alibaba Cloud) so token counts serve as the compute budget metric."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No seed sensitivity analysis. Results are reported at temperature 0.2 but no multiple-seed experiments are conducted."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The number of experimental runs is not stated. Results appear to be from single runs."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No hyperparameter search budget is reported for NSVIF. The self-reflection count (3) and temperature (0.2) appear chosen without documented search."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Default configurations are used without justification. The choice of 3 self-reflections, temperature 0.2, and specific prompt designs are not justified through systematic comparison."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Results are compared across 6 models × 4 verifiers, but no multiple-comparison correction is applied or discussed."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Authors evaluate their own system (NSVIF) and benchmark (VIFBENCH) without acknowledging this bias. The benchmark was designed by the same team that built NSVIF."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "NSVIF uses ~130M input tokens vs ~812K for the baseline (Table 7) — roughly 160x more compute. This massive cost difference is not discussed as a factor in the performance comparison."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "§4 discusses why LLMBar is inadequate (preference ≠ instruction following) and designs VIFBENCH with fine-grained constraint labels to address this validity gap. The paper explicitly discusses what the benchmark measures vs what is claimed."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "When comparing across models (Table 2), the same NSVIF scaffold is used for all models, and the same baseline prompt is used for all models, controlling the scaffold variable. The scaffold IS the independent variable when comparing NSVIF vs baseline."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of temporal leakage. While VIFBENCH is newly created, the paper does not explicitly state when it was created relative to model training cutoffs."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup could leak answer information. For instance, NSVIF provides the full instruction and output to the LLM multiple times across agents."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "VIFBENCH instructions are generated from Cartesian products of constraint combinations, creating structural similarities across test examples. This non-independence is not discussed."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No leakage detection or prevention method is used or discussed."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "NSVIF significantly outperforms LLM-as-a-judge baselines, achieving up to 25.7% F1 improvement on VIFBENCH.",
    363       "evidence": "Table 2 shows NSVIF with GPT-4.1 achieves 94.8% F1 vs 69.1% for baseline (25.7%↑). Similar improvements across DeepSeek-V3.1 (22.0%↑) and Qwen3-Max (23.2%↑).",
    364       "supported": "moderate"
    365     },
    366     {
    367       "claim": "The neuro-symbolic approach with logic reasoning substantially improves verification over using LLMs alone.",
    368       "evidence": "Table 3 ablation: NSVIF (94.8% F1) vs NSVIF-NEU without logic reasoning (84.0% F1) vs best LLM-only approach CONV-COT (78.9% F1), all using GPT-4.1.",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "NSVIF provides interpretable feedback that helps LLMs improve instruction following without post-training.",
    373       "evidence": "§5.4: On 9 selected VIFBENCH instructions, GPT-4.1 produces satisfying outputs in <5 iterations with NSVIF feedback, vs 3/9 failing within 15 iterations with boolean-only feedback.",
    374       "supported": "weak"
    375     },
    376     {
    377       "claim": "NSVIF is a universal, general-purpose verifier making no assumption on the instruction or LLM.",
    378       "evidence": "The framework description (§3) makes no structural assumptions, but evaluation (§5) is limited to VIFBENCH's English writing tasks with 10 constraint types.",
    379       "supported": "weak"
    380     },
    381     {
    382       "claim": "Small models (GPT-4o-mini) are ineffective for NSVIF due to constraint misclassification.",
    383       "evidence": "§5.2: GPT-4o-mini misclassifies constraints in 508/820 instructions, causing NSVIF (37.3% F1) to underperform the baseline (66.6% F1).",
    384       "supported": "strong"
    385     }
    386   ],
    387   "methodology_tags": ["benchmark-eval"],
    388   "key_findings": "NSVIF, a neuro-symbolic framework for verifying LLM instruction following, formulates verification as a constraint-satisfaction problem combining logic reasoning (via Z3 SMT solver) with semantic analysis. On VIFBENCH (820 examples), NSVIF achieves 94.8% F1 with GPT-4.1, outperforming the standard LLM-as-a-judge baseline (69.1% F1) by 25.7 percentage points. Ablation shows logic reasoning for symbolic constraints accounts for an ~11-point F1 improvement over neural-only verification (94.8% vs 84.0%). However, NSVIF requires ~160x more input tokens than the baseline and fails with smaller models (GPT-4o-mini) due to constraint misclassification.",
    389   "red_flags": [
    390     {
    391       "flag": "Massive compute cost difference ignored",
    392       "detail": "NSVIF uses ~130M input tokens vs ~812K for the baseline (Table 7), approximately 160x more compute. This enormous cost difference is not discussed when comparing effectiveness, making the comparison misleading — the baseline could potentially improve with equivalent compute budget (e.g., multiple LLM calls, ensemble voting)."
    393     },
    394     {
    395       "flag": "Authors evaluate on their own benchmark",
    396       "detail": "Both NSVIF and VIFBENCH are created by the same team. The benchmark's constraint types (8 logic + 2 semantic, all in English writing) may favor NSVIF's architecture. No external benchmark evaluation is included."
    397     },
    398     {
    399       "flag": "Universality claims far exceed evaluation scope",
    400       "detail": "The paper repeatedly claims NSVIF is 'universal, general-purpose' but evaluates only on English writing tasks with 10 constraint types. Real-world instruction following involves code generation, multi-modal outputs, tool use, and complex reasoning — none tested."
    401     },
    402     {
    403       "flag": "No variance or significance testing",
    404       "detail": "All results are point estimates from apparent single runs at temperature 0.2. LLM outputs are stochastic; without multiple runs and significance tests, the claimed improvements may not be reliable."
    405     },
    406     {
    407       "flag": "Multi-turn feedback evaluation on 9 cherry-picked examples",
    408       "detail": "§5.4 evaluates NSVIF's feedback loop on only 9 instructions 'which the LLM failed to follow,' sampled by the authors. This tiny, non-random sample cannot support generalizable claims about NSVIF improving instruction following."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Why Do Multi-Agent LLM Systems Fail?",
    414       "authors": ["Mert Cemri", "Melissa Z. Pan", "Shuyi Yang"],
    415       "year": 2025,
    416       "relevance": "Studies failure modes in multi-agent LLM systems, directly relevant to agentic AI reliability."
    417     },
    418     {
    419       "title": "Evaluating Large Language Models Trained on Code",
    420       "authors": ["Mark Chen", "Jerry Tworek"],
    421       "year": 2021,
    422       "arxiv_id": "2107.03374",
    423       "relevance": "Introduced Codex/HumanEval benchmark and Pass@k metric used in this paper's evaluation."
    424     },
    425     {
    426       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    427       "authors": ["Carlos E. Jimenez", "John Yang"],
    428       "year": 2024,
    429       "relevance": "Major benchmark for evaluating LLM code generation agents on real software engineering tasks."
    430     },
    431     {
    432       "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines",
    433       "authors": ["Omar Khattab", "Arnav Singhvi"],
    434       "year": 2024,
    435       "relevance": "Framework for structured LLM programming, used for GEPA-COT baseline optimization in this paper."
    436     },
    437     {
    438       "title": "LLMs Get Lost In Multi-Turn Conversation",
    439       "authors": ["Philippe Laban", "Hiroaki Hayashi"],
    440       "year": 2025,
    441       "arxiv_id": "2505.06120",
    442       "relevance": "Studies LLM instruction following degradation in multi-turn settings, relevant to agentic workflow reliability."
    443     },
    444     {
    445       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    446       "authors": ["Lianmin Zheng", "Wei-Lin Chiang"],
    447       "year": 2023,
    448       "relevance": "Foundational work on LLM-as-a-judge methodology, the baseline approach this paper improves upon."
    449     },
    450     {
    451       "title": "Benchmarking Complex Instruction-Following with Multiple Constraints Composition",
    452       "authors": ["Bosi Wen", "Pei Ke"],
    453       "year": 2024,
    454       "relevance": "ComplexBench — instruction following benchmark that models instructions as constraint compositions, directly inspiring VIFBENCH."
    455     },
    456     {
    457       "title": "Instruction-Following Evaluation for Large Language Models",
    458       "authors": ["Jeffrey Zhou", "Tianjian Lu"],
    459       "year": 2023,
    460       "arxiv_id": "2311.07911",
    461       "relevance": "IFEval benchmark for LLM instruction following, part of the evaluation landscape this paper builds on."
    462     },
    463     {
    464       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    465       "authors": ["Qingyun Wu", "Gagan Bansal"],
    466       "year": 2024,
    467       "relevance": "Multi-agent LLM framework; the CONV-COT baseline uses a conversation-based approach inspired by this work."
    468     },
    469     {
    470       "title": "LINC: A Neurosymbolic Approach for Logical Reasoning by Combining Language Models with First-Order Logic Provers",
    471       "authors": ["Theo Olausson", "Alex Gu"],
    472       "year": 2023,
    473       "relevance": "Prior neurosymbolic approach combining LLMs with formal logic, directly inspiring NSVIF's architecture."
    474     },
    475     {
    476       "title": "SWE-smith: Scaling Data for Software Engineering Agents",
    477       "authors": ["John Yang", "Kilian Lieret"],
    478       "year": 2025,
    479       "arxiv_id": "2504.21798",
    480       "relevance": "Scaling training data for SE agents; cited for Python being the best LLM-supported language."
    481     },
    482     {
    483       "title": "AGENTIF: Benchmarking Instruction Following of Large Language Models in Agentic Scenarios",
    484       "authors": ["Yunjia Qi", "Hao Peng"],
    485       "year": 2025,
    486       "arxiv_id": "2505.16944",
    487       "relevance": "Benchmark specifically for instruction following in agentic contexts, directly related to NSVIF's target use case."
    488     }
    489   ]
    490 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs