scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32869B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Decoding ML Decision: An Agentic Reasoning Framework for Large-Scale Ranking System",
      6     "authors": [
      7       "Longfei Yun",
      8       "Yihan Wu",
      9       "Haoran Liu",
     10       "Xiaoxuan Liu",
     11       "Ziyun Xu",
     12       "Yi Wang",
     13       "Yang Xia",
     14       "Pengfei Wang",
     15       "Mingze Gao",
     16       "Yunxiang Wang",
     17       "Changfan Chen",
     18       "Junfeng Pan"
     19     ],
     20     "year": 2026,
     21     "venue": "arXiv",
     22     "arxiv_id": "2602.18640",
     23     "doi": null
     24   },
     25   "checklist": {
     26     "claims_and_evidence": {
     27       "abstract_claims_supported": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The abstract claims GEARS 'consistently identifies superior, near-Pareto-efficient policies' and enables 'high-level intent vibe personalization.' The offline evaluation (Table 1) supports policy selection accuracy, but the 'near-Pareto-efficient' claim is only illustrated in a single Figure 3 example. The 'vibe personalization' claim has no quantitative evaluation.",
     31         "source": "opus"
     32       },
     33       "causal_claims_justified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper makes causal claims through ablation ('removing the bash-based filtering stage leads to a substantial degradation') which are adequate. However, the real-world deployment claims (Table 3) attribute metric improvements to GEARS without controlling for confounds. The case study (§5.1) claims different user types 'react differently' to treatments, which is a causal claim from observational cohort analysis.",
     37         "source": "opus"
     38       },
     39       "generalization_bounded": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The title says 'Large-Scale Ranking System' (general) but results are exclusively from Meta's internal systems. The conclusion calls GEARS 'a new standard for AI-driven ranking infrastructure' without bounding this to Meta's specific context. No acknowledgment that results may not transfer to other companies' ranking systems.",
     43         "source": "opus"
     44       },
     45       "alternative_explanations_discussed": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No alternative explanations are discussed. For example, the superiority over baselines could be partly due to the baselines lacking access to the same domain knowledge or tools, not the agentic framework per se. The real-world improvements could reflect the value of the underlying GAS algorithm rather than the agentic layer.",
     49         "source": "opus"
     50       },
     51       "proxy_outcome_distinction": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The offline evaluation uses ground-truth top-5 policies as the target, but 'ground truth' is computed from the same metric measurements the system optimizes. The paper does not discuss whether ranking accuracy on internal metrics translates to actual user-facing improvements. Table 3 shows real-world metrics but uses anonymized 'Metric 1/2/3' without discussing what these proxy for.",
     55         "source": "opus"
     56       }
     57     },
     58     "limitations_and_scope": {
     59       "limitations_section_present": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "There is no dedicated limitations or threats-to-validity section in the paper.",
     63         "source": "opus"
     64       },
     65       "threats_to_validity_specific": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No specific threats to validity are discussed anywhere in the paper.",
     69         "source": "opus"
     70       },
     71       "scope_boundaries_stated": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The paper does not explicitly state what GEARS does NOT show or where it would not apply. The conclusion calls it 'a new standard for AI-driven ranking infrastructure' without any scope boundaries.",
     75         "source": "opus"
     76       }
     77     },
     78     "conflicts_of_interest": {
     79       "funding_disclosed": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No funding or acknowledgments section is present. All authors are from Meta but no funding disclosure is made.",
     83         "source": "opus"
     84       },
     85       "affiliations_disclosed": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "All authors are listed with Meta affiliation on the first page.",
     89         "source": "opus"
     90       },
     91       "funder_independent_of_outcome": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "Meta employees are evaluating a Meta-internal system (GEARS) designed for Meta's ranking infrastructure. Meta has a direct financial interest in showing their internal tooling is effective.",
     95         "source": "opus"
     96       },
     97       "financial_interests_declared": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No competing interests or financial interests statement is provided. Authors are Meta employees evaluating a system that improves Meta's core business (ranking/recommendation).",
    101         "source": "opus"
    102       }
    103     },
    104     "scope_and_framing": {
    105       "key_terms_defined": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "Key terms lack precise definitions: 'Vibe Optimization' described functionally but not formally defined; 'Deterministic Lifecycle Governance' is vague; 'Specialized Agent Skills' described as concept but unclear what constitutes a skill operationally.",
    109         "source": "haiku"
    110       },
    111       "intended_contribution_clear": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "Three contributions stated but not clearly differentiated from prior work. What is novel about framing as 'autonomous discovery' vs. existing agent/optimization research?",
    115         "source": "haiku"
    116       },
    117       "engagement_with_prior_work": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "Related work lists papers but does not deeply engage. For uplift modeling: identifies gap ('may never ship') but doesn't explain why GEARS beats alternatives like Bayesian optimization. No explicit comparison statements.",
    121         "source": "haiku"
    122       }
    123     }
    124   },
    125   "type_checklist": {
    126     "empirical": {
    127       "artifacts": {
    128         "code_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "No code repository URL or archive is provided anywhere in the paper. The system is built on internal Meta infrastructure.",
    132           "source": "opus"
    133         },
    134         "data_released": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The benchmark dataset is constructed from 20 internal Meta experiments. No data is released or publicly available.",
    138           "source": "opus"
    139         },
    140         "environment_specified": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No environment specifications, dependency lists, or setup instructions are provided.",
    144           "source": "opus"
    145         },
    146         "reproduction_instructions": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No reproduction instructions are provided. The system relies on internal Meta infrastructure and proprietary experiments.",
    150           "source": "opus"
    151         }
    152       },
    153       "statistical_methodology": {
    154         "confidence_intervals_or_error_bars": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Table 4 reports results with standard errors (e.g., '−0.049% ± 0.043'). However, the main comparison in Table 1 reports only point estimates without uncertainty.",
    158           "source": "opus"
    159         },
    160         "significance_tests": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "The paper claims GEARS 'consistently outperforms all baselines' (Table 1) but no statistical significance tests are applied to any of the comparisons. Differences are assessed by comparing raw numbers only.",
    164           "source": "opus"
    165         },
    166         "effect_sizes_reported": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "Table 1 provides absolute metric values for all methods, and Table 3 reports percentage lifts for real-world deployments. Table 4 gives percentage lift with standard errors. This provides context for the magnitude of effects.",
    170           "source": "opus"
    171         },
    172         "sample_size_justified": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "The benchmark uses 100 instructions (20 experiments × 5 types). No justification is given for why 20 experiments is sufficient, and no power analysis is discussed.",
    176           "source": "opus"
    177         },
    178         "variance_reported": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "Table 1 (the main results) reports single-run point estimates with no standard deviation or variance across runs. Implementation details mention Self-Consistency samples 5 responses, but GEARS results show no variance measures.",
    182           "source": "opus"
    183         }
    184       },
    185       "evaluation_design": {
    186         "baselines_included": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Table 1 compares against 5 baselines: Naive Prompting, Chain-of-Thought, Self-Consistency, Self-Refine, and Code-as-Action.",
    190           "source": "opus"
    191         },
    192         "baselines_contemporary": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Baselines include recent methods: Code-as-Action (Wang et al., 2024), Self-Refine (Madaan et al., 2023), Self-Consistency (Wang et al., 2022), and CoT (Wei et al., 2022). These represent the contemporary prompting landscape.",
    196           "source": "opus"
    197         },
    198         "ablation_study": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Table 1 includes two ablation variants: 'GEARS w/o Bash' and 'GEARS w/o Skill', isolating the contributions of the bash filtering and skill modules.",
    202           "source": "opus"
    203         },
    204         "multiple_metrics": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Table 1 reports 12 metrics: nDCG@1/3/5, Precision@1/3/5, Rank Correlation, Recall@1/3/5, Top-1 Accuracy, and Top-1 in GT.",
    208           "source": "opus"
    209         },
    210         "human_evaluation": {
    211           "applies": true,
    212           "answer": false,
    213           "justification": "No human evaluation of GEARS outputs is performed. All evaluation is automated using ground-truth policy rankings computed from metric measurements.",
    214           "source": "opus"
    215         },
    216         "held_out_test_set": {
    217           "applies": true,
    218           "answer": false,
    219           "justification": "No description of train/test/validation splits. The 100 instructions are evaluated but there is no mention of any held-out set or how the system was developed vs. evaluated.",
    220           "source": "opus"
    221         },
    222         "per_category_breakdown": {
    223           "applies": true,
    224           "answer": false,
    225           "justification": "Results in Table 1 are aggregated across all 100 instructions. No breakdown by instruction type (Maximize Both, Tradeoff Analysis, etc.) is provided despite having 5 distinct scenario types.",
    226           "source": "opus"
    227         },
    228         "failure_cases_discussed": {
    229           "applies": true,
    230           "answer": false,
    231           "justification": "No failure cases are discussed. The paper does not analyze when GEARS fails or where specific baselines outperform it.",
    232           "source": "opus"
    233         },
    234         "negative_results_reported": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "The 'GEARS w/o Bash' ablation shows dramatically degraded performance (e.g., Top-1 Acc drops from 86% to 26%), which is a negative result demonstrating that without filtering, the approach fails badly.",
    238           "source": "opus"
    239         }
    240       },
    241       "setup_transparency": {
    242         "model_versions_specified": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "Section 4.1 states 'All experiments use Claude Sonnet (ant) as the backbone LLM' but provides no specific version, snapshot date, or API version. 'Claude Sonnet' without a version identifier is insufficient.",
    246           "source": "opus"
    247         },
    248         "prompts_provided": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "The paper describes Skills architecture and progressive disclosure strategy in general terms but does not provide actual prompt text, system instructions, or skill definitions used in experiments.",
    252           "source": "opus"
    253         },
    254         "hyperparameters_reported": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "Section 4.1 reports temperature 0.7 for Self-Consistency with 5 samples, and single refinement iteration for Self-Refine. The tolerance hyperparameter τ is defined in Algorithm 1.",
    258           "source": "opus"
    259         },
    260         "scaffolding_described": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "The paper describes the agentic scaffolding in detail: Specialized Agent Skills (§3.2), progressive disclosure strategy with three stages, Domain Knowledge Brain, subagent delegation, and deterministic lifecycle governance hooks (§3.3). Figure 1 shows the end-to-end workflow.",
    264           "source": "opus"
    265         },
    266         "data_preprocessing_documented": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Section 4.1 describes how the benchmark was constructed: 20 internal experiments → GAS generates hundreds of policy candidates → 5 instruction types synthesized per experiment → 100 total instructions. Ground-truth computed as top-5 policies per criteria.",
    270           "source": "opus"
    271         }
    272       },
    273       "data_integrity": {
    274         "raw_data_available": {
    275           "applies": true,
    276           "answer": false,
    277           "justification": "No raw data is available. All experiments use internal Meta data that is not released.",
    278           "source": "opus"
    279         },
    280         "data_collection_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Section 4.1 describes how the benchmark was constructed: 20 internal experiments with GAS-generated policy candidates. Section 4.3.1 describes the feature stability benchmarking methodology including User-Cohort Shift Ratio and 6-month windows.",
    284           "source": "opus"
    285         },
    286         "recruitment_methods_described": {
    287           "applies": false,
    288           "answer": false,
    289           "justification": "No human participants. The study uses internal experimental data from Meta's ranking systems, not recruited participants.",
    290           "source": "opus"
    291         },
    292         "data_pipeline_documented": {
    293           "applies": true,
    294           "answer": true,
    295           "justification": "Section 4.1 documents the pipeline: experiments → GAS algorithm → policy candidates with metrics → 5 instruction types per experiment → ground-truth computation as top-5 policies. Algorithm 1 documents the tolerance-based filtering pipeline.",
    296           "source": "opus"
    297         }
    298       },
    299       "contamination": {
    300         "training_cutoff_stated": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "The paper does not evaluate a pre-trained model's capability on a standard benchmark. It uses an LLM (Claude Sonnet) as a reasoning engine applied to internal tabular data for policy selection, not testing the model's knowledge.",
    304           "source": "opus"
    305         },
    306         "train_test_overlap_discussed": {
    307           "applies": false,
    308           "answer": false,
    309           "justification": "Same reasoning: the evaluation is about policy selection on internal Meta experiments, not about whether the LLM has memorized benchmark answers.",
    310           "source": "opus"
    311         },
    312         "benchmark_contamination_addressed": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "The benchmark consists of internal Meta experiments that would not be in any LLM's training data. Contamination is not a relevant concern here.",
    316           "source": "opus"
    317         }
    318       },
    319       "human_studies": {
    320         "pre_registered": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants in this study.",
    324           "source": "opus"
    325         },
    326         "irb_or_ethics_approval": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants in this study.",
    330           "source": "opus"
    331         },
    332         "demographics_reported": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in this study.",
    336           "source": "opus"
    337         },
    338         "inclusion_exclusion_criteria": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants in this study.",
    342           "source": "opus"
    343         },
    344         "randomization_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants in this study.",
    348           "source": "opus"
    349         },
    350         "blinding_described": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants in this study.",
    354           "source": "opus"
    355         },
    356         "attrition_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "No human participants in this study.",
    360           "source": "opus"
    361         }
    362       },
    363       "cost_and_practicality": {
    364         "inference_cost_reported": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "GEARS uses Claude Sonnet with multiple agent calls, skill activations, and subagent delegations per policy selection task. No API costs, token counts, or latency figures are reported.",
    368           "source": "opus"
    369         },
    370         "compute_budget_stated": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "No total computational budget is stated — no mention of API spend, total tokens, or wall-clock time for experiments.",
    374           "source": "opus"
    375         }
    376       },
    377       "experimental_rigor": {
    378         "seed_sensitivity_reported": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "No results across multiple random seeds are reported. Table 1 shows single-run results for all methods except Self-Consistency (which samples 5 responses but reports a single aggregated number).",
    382           "source": "opus"
    383         },
    384         "number_of_runs_stated": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "The number of experimental runs is not stated for any method. It appears each method was run once per instruction.",
    388           "source": "opus"
    389         },
    390         "hyperparameter_search_budget": {
    391           "applies": true,
    392           "answer": false,
    393           "justification": "No hyperparameter search budget is reported for GEARS or baselines. The tolerance hyperparameter τ and other design choices appear tuned but no search process is documented.",
    394           "source": "opus"
    395         },
    396         "best_config_selection_justified": {
    397           "applies": true,
    398           "answer": false,
    399           "justification": "No discussion of how GEARS configuration was selected. The paper presents a single configuration without justifying the choices or reporting alternatives tried.",
    400           "source": "opus"
    401         },
    402         "multiple_comparison_correction": {
    403           "applies": true,
    404           "answer": false,
    405           "justification": "12 metrics are reported across 8 methods with no correction for multiple comparisons. No significance tests are performed at all.",
    406           "source": "opus"
    407         },
    408         "self_comparison_bias_addressed": {
    409           "applies": true,
    410           "answer": false,
    411           "justification": "The authors built GEARS and compare it against their own implementations of all baselines. This bias is not acknowledged. The baselines may be weaker implementations than the authors' own system.",
    412           "source": "opus"
    413         },
    414         "compute_budget_vs_performance": {
    415           "applies": true,
    416           "answer": false,
    417           "justification": "GEARS uses multi-agent orchestration with skills, subagents, and domain knowledge, presumably requiring far more compute than single-call baselines like Naive or CoT. This compute difference is never discussed.",
    418           "source": "opus"
    419         },
    420         "benchmark_construct_validity": {
    421           "applies": true,
    422           "answer": false,
    423           "justification": "The benchmark uses ground-truth defined as top-5 policies by specified optimization criteria. Whether matching these top-5 policies on internal metrics actually leads to better real-world outcomes is not discussed. The construct validity of the benchmark as a proxy for deployment success is unexamined.",
    424           "source": "opus"
    425         },
    426         "scaffold_confound_addressed": {
    427           "applies": true,
    428           "answer": false,
    429           "justification": "GEARS has access to specialized skills, domain knowledge brain, bash execution, and subagents — a much richer scaffold than baselines which are simple prompting strategies. The comparison conflates the scaffold advantage with the method advantage. This confound is not acknowledged.",
    430           "source": "opus"
    431         }
    432       },
    433       "data_leakage": {
    434         "temporal_leakage_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "No discussion of whether the evaluation data overlaps with development data used to design GEARS's skills and prompts. The system was likely iterated on similar internal experiments before the formal evaluation.",
    438           "source": "opus"
    439         },
    440         "feature_leakage_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "GEARS has access to a 'Domain Knowledge Brain' with historical experiment data. Whether this gives it an unfair advantage by encoding knowledge of the evaluation experiments is not discussed.",
    444           "source": "opus"
    445         },
    446         "non_independence_addressed": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "The 20 experiments come from the same Meta infrastructure. Whether they are independent or share structural similarities (same product surfaces, overlapping user populations) is not discussed.",
    450           "source": "opus"
    451         },
    452         "leakage_detection_method": {
    453           "applies": true,
    454           "answer": false,
    455           "justification": "No leakage detection or prevention methods are applied. No verification that the Domain Knowledge Brain does not contain information about the evaluation experiments.",
    456           "source": "opus"
    457         }
    458       }
    459     }
    460   },
    461   "claims": [
    462     {
    463       "claim": "GEARS achieves 94% nDCG@1 on offline policy selection, outperforming Code-as-Action (77%)",
    464       "evidence": "Table 1 shows GEARS nDCG@1=0.94 vs Code-as-Action 0.77 across 100 evaluation instructions",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "Specialized Agent Skills contribute meaningfully to performance gains",
    469       "evidence": "Ablation: GEARS w/o Skill drops to 0.87 nDCG@1 vs 0.94 full, suggesting ~7pp contribution",
    470       "supported": "moderate"
    471     },
    472     {
    473       "claim": "Deterministic filtering (Bash) is critical for stabilizing reasoning",
    474       "evidence": "Ablation: GEARS w/o Bash drops from 0.94 to 0.40 nDCG@1 (~57% relative loss), suggesting most gains come from filtering rather than agentic reasoning",
    475       "supported": "moderate"
    476     },
    477     {
    478       "claim": "GEARS significantly reduces human engineering overhead",
    479       "evidence": "Mentioned in intro and conclusion but never quantified (no time, cost, or throughput data provided)",
    480       "supported": "unsupported"
    481     },
    482     {
    483       "claim": "Real-world deployment across 9 surfaces shows metric improvements (0.011% to 0.37%)",
    484       "evidence": "Table 3 shows improvements but high variance, many missing metrics per surface, no error bars or significance tests",
    485       "supported": "moderate"
    486     },
    487     {
    488       "claim": "Feature stability governance filters out brittle policies that overfit transient signals",
    489       "evidence": "Figure 4 shows one policy maintained performance over 1 month, but no quantitative before/after on deployment success rate",
    490       "supported": "weak"
    491     },
    492     {
    493       "claim": "Tolerance-based Pareto expansion surfaces practically valuable non-convex policies",
    494       "evidence": "Algorithm 1 described but no empirical validation that non-convex policies deliver better real-world outcomes",
    495       "supported": "weak"
    496     },
    497     {
    498       "claim": "Case study shows GEARS turned a multi-week manual discovery process into an automated agentic workflow",
    499       "evidence": "Section 5.1 anonymized case study describes outcome but provides no timing data or comparison to prior manual process",
    500       "supported": "weak"
    501     }
    502   ],
    503   "methodology_tags": [
    504     "benchmark-eval",
    505     "case-study",
    506     "observational"
    507   ],
    508   "key_findings": "GEARS achieves 94% nDCG@1 on offline policy selection vs. 77% for Code-as-Action baseline, with ablations showing both deterministic filtering and specialized skills contribute to performance. Real-world deployment across 9 Meta product surfaces demonstrates metric improvements ranging from 0.011% to 0.37%, though statistical rigor is unclear and results are heavily anonymized. Ablation analysis shows that removing deterministic filtering cuts nDCG@1 from 0.94 to 0.40 (~57% relative loss), versus 0.94 to 0.87 when removing skills, raising questions about the contribution of the agentic architecture itself.",
    509   "red_flags": [
    510     {
    511       "flag": "Unreproducible system",
    512       "detail": "No code, data, or detailed specifications released; full system is internal Meta proprietary."
    513     },
    514     {
    515       "flag": "Ablation suggests filtering drives gains",
    516       "detail": "GEARS w/o Bash drops to 0.40 vs 0.94 nDCG@1 (~57% relative loss), implying simple deterministic filtering accounts for the majority of the improvement over baselines."
    517     },
    518     {
    519       "flag": "Weak real-world evidence",
    520       "detail": "Table 3 improvements range 0.011%-0.37% with no error bars, significance tests, or baseline context; heavily anonymized surfaces prevent verification."
    521     },
    522     {
    523       "flag": "No causality established",
    524       "detail": "No A/B test comparing GEARS vs. human expert selection in production; offline improvements don't prove causal impact on deployed policies."
    525     },
    526     {
    527       "flag": "Model contamination not addressed",
    528       "detail": "Claude Sonnet training cutoff unstated; model likely saw ranking/recommendation research, potentially biasing results in GEARS's favor."
    529     },
    530     {
    531       "flag": "Scope claims exceed evidence",
    532       "detail": "Claims 'general framework' and 'general-purpose mechanism' but only evaluated on Meta's internal systems; no cross-company or external validation."
    533     },
    534     {
    535       "flag": "Undisclosed conflict of interest",
    536       "detail": "Meta employees evaluating Meta's system on Meta products with Meta-deployed solution; conflict not acknowledged."
    537     },
    538     {
    539       "flag": "Undefined key contribution",
    540       "detail": "'Vibe Optimization' poorly defined operationally; unclear how natural-language intent maps to algorithmic constraints."
    541     },
    542     {
    543       "flag": "Missing baselines",
    544       "detail": "No comparison to Bayesian optimization, simpler agent architectures, or existing Meta solutions; only prompt-engineering baselines."
    545     },
    546     {
    547       "flag": "Case study lacks specificity",
    548       "detail": "Section 5.1 presents a single anonymized example; no quantified time savings, cost reduction, or improvement magnitude vs. prior manual process."
    549     },
    550     {
    551       "flag": "High variance in real-world results",
    552       "detail": "Table 3 improvements 0.011%-0.37% across surfaces with no explanation for variation; could be noise or selection bias."
    553     },
    554     {
    555       "flag": "No human evaluation of policy quality",
    556       "detail": "Policies ranked programmatically by nDCG; not validated by domain experts or A/B testing."
    557     }
    558   ],
    559   "cited_papers": [
    560     {
    561       "title": "Chain-of-Thought Prompting Elicits Reasoning in Language Models",
    562       "authors": "Wei et al.",
    563       "year": 2022,
    564       "relevance": "Baseline technique for multi-step reasoning; foundational for agentic reasoning research."
    565     },
    566     {
    567       "title": "Augmented Language Models: A Survey",
    568       "authors": "Mialon et al.",
    569       "year": 2023,
    570       "relevance": "Comprehensive overview of tool-integrated reasoning and agent architectures; directly relevant to GEARS's skill framework."
    571     },
    572     {
    573       "title": "Meta-learners for Estimating Heterogeneous Treatment Effects Using Machine Learning",
    574       "authors": "Künzel et al.",
    575       "year": 2019,
    576       "relevance": "Foundational HTE/uplift modeling; GEARS builds on GAS, which uses similar principles for cohort-level personalization."
    577     },
    578     {
    579       "title": "Adaptive Experimentation Platform (AX)",
    580       "authors": "Bakshy et al.",
    581       "year": 2018,
    582       "relevance": "Prior work on adaptive multi-objective optimization in ranking; GEARS positioned as agentic alternative to bandit/Bayesian methods."
    583     },
    584     {
    585       "title": "GAS: Large-scale Heterogeneous Personalization in Social Network Applications at Meta",
    586       "authors": "Wu et al.",
    587       "year": 2025,
    588       "relevance": "Core technical foundation for GEARS; GAS generates policy candidates that GEARS filters and refines."
    589     },
    590     {
    591       "title": "Why Do Multi-Agent LLM Systems Fail?",
    592       "authors": "Cemri et al.",
    593       "year": 2025,
    594       "relevance": "Addresses reliability of multi-agent systems; directly motivates GEARS's Deterministic Lifecycle Governance approach."
    595     },
    596     {
    597       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    598       "authors": "Shinn et al.",
    599       "year": 2023,
    600       "relevance": "Agent self-refinement technique; relevant to GEARS's iterative policy refinement loop."
    601     },
    602     {
    603       "title": "Uplift Modeling with Multiple Treatments and General Response Types",
    604       "authors": "Zhao et al.",
    605       "year": 2017,
    606       "relevance": "Foundational tree-based uplift method; represents traditional approach that GEARS claims to improve upon."
    607     },
    608     {
    609       "title": "Program of Thoughts Prompting: Disentangling Computation from Reasoning",
    610       "authors": "Chen et al.",
    611       "year": 2022,
    612       "relevance": "Code-generation for reasoning tasks; relevant to GEARS's use of bash/code execution for deterministic filtering."
    613     },
    614     {
    615       "title": "Executable Code Actions Elicit Better LLM Agents",
    616       "authors": "Wang et al.",
    617       "year": 2024,
    618       "relevance": "Code-as-Action baseline; directly compared against GEARS in Table 1."
    619     }
    620   ],
    621   "engagement_factors": {
    622     "practical_relevance": {
    623       "score": 1,
    624       "justification": "Describes an internal Meta framework for ranking optimization that cannot be reproduced externally due to proprietary infrastructure and data."
    625     },
    626     "surprise_contrarian": {
    627       "score": 0,
    628       "justification": "Confirms the expected finding that a heavily engineered agentic system with domain knowledge outperforms vanilla prompting baselines."
    629     },
    630     "fear_safety": {
    631       "score": 0,
    632       "justification": "No safety, security, or risk concerns are raised or relevant to the work."
    633     },
    634     "drama_conflict": {
    635       "score": 0,
    636       "justification": "No controversy, no challenge to existing claims, and no conflict with other work or companies."
    637     },
    638     "demo_ability": {
    639       "score": 0,
    640       "justification": "Entirely proprietary system with no code, demo, or reproducible components available."
    641     },
    642     "brand_recognition": {
    643       "score": 2,
    644       "justification": "From Meta with all 12 authors being Meta employees, though the specific product area (ranking optimization) is not consumer-facing or widely discussed."
    645     }
    646   },
    647   "hn_data": {
    648     "threads": [
    649       {
    650         "hn_id": "47136272",
    651         "title": "Package Managers à la Carte: a formal model of dependency resolution",
    652         "points": 55,
    653         "comments": 17,
    654         "url": "https://news.ycombinator.com/item?id=47136272",
    655         "created_at": "2026-02-24T12:27:44Z"
    656       }
    657     ],
    658     "top_points": 55,
    659     "total_points": 55,
    660     "total_comments": 17
    661   }
    662 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs