scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28452B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Hint-Augmented Re-ranking: Efficient Product Search using LLM-Based Query Decomposition",
      6     "authors": [
      7       "Yilun Zhu",
      8       "Nikhita Vedula",
      9       "S. Malmasi"
     10     ],
     11     "year": 2025,
     12     "venue": "IJCNLP-AACL",
     13     "arxiv_id": "2511.13994",
     14     "doi": "10.48550/arXiv.2511.13994"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The 10.9 MAP improvement is supported by Table 2 (QE-BM25 35.74 vs BM25 24.79), and the 5.9 MRR improvement is supported (3B+H 74.74 vs 3B 68.82), though these compare against different baselines in different experimental conditions.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper uses direct ablation (same model with and without hints at identical training conditions) to establish the contribution of hints, and validates with human evaluation (65.38% win rate, kappa=0.74) confirming the causal claim holds beyond LLM-generated labels.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The conclusion claims to advance 'linguistic interpretation in retrieval systems' broadly, but evaluation is restricted to English Amazon e-commerce superlative queries only; the limitations section acknowledges this but the main conclusions are still framed too broadly.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper does not discuss whether improvements stem from longer input context (vs semantic knowledge transfer), data augmentation effects from LLM-enriched training text, or the circularity of Claude-generated labels favoring Claude-generated hints; the Nova Pro re-evaluation only partially addresses the last concern.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Primary automatic evaluation uses relevance labels generated by Claude 3.5 Sonnet v2 — the same model used to generate hints — creating a circular measurement where LLM agreement proxies for user satisfaction; the 153-query human evaluation is too small to anchor the main conclusions.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "There is a dedicated 'Limitations' section following the conclusion that discusses model diversity, English-only scope, and directions for extension.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Limitations are specific: only two open-source LLMs tested for listwise (no commercial models like o1), pointwise restricted to Qwen family, English-only queries with explicitly noted multilingual gap; these are concrete rather than generic disclaimers.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The limitations section explicitly states the approach is restricted to English queries, that model diversity is limited to Qwen for pointwise reranking, and that non-superlative queries are out of scope.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding disclosure is present; the work is from Amazon employees but there is no explicit statement about funding source or internal research budget.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors list Amazon.com, Inc. as their affiliation with institutional email addresses disclosed.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "Amazon employees are evaluating techniques for Amazon product search using the Amazon Shopping Queries Dataset; the employer (Amazon) directly benefits from the research outcomes, making independence impossible.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "There is no competing interests statement, patent disclosure, or equity declaration anywhere in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are defined: 'hints' (attribute-value interpretations generated from superlatives), three relevance categories (RELEVANT AND BEST, RELEVANT BUT NOT BEST, IRRELEVANT) are defined with specific criteria in Appendix E, and 'superlative expressions' are illustrated with concrete examples.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Three explicit contributions are stated: (i) a 21K-query/470K-product dataset, (ii) an efficient hint-augmented ranking method, and (iii) evidence that small models enhanced with hints surpass large models.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper engages substantively with prior LLM reranking work (Sun 2023, Qin 2024), query expansion methods (Jagerman 2023, Wang 2023), and prior superlative interpretation work (Bos & Nissim 2006, Pyatkin 2025), explicitly positioning the contribution relative to each.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The paper states 'Code and data is available at: https://github.com/yilunzhu/superhints/' in the introduction footnote.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The GitHub link includes both code and data; the base Amazon Shopping Queries Dataset is also publicly available from KDD Cup 2022.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "GPU types and AWS instance types are specified (L40s, A100), but no requirements.txt, Dockerfile, or explicit library version list is mentioned in the paper.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Training hyperparameters and the pipeline are described, but there are no step-by-step reproduction instructions in the paper; reproduction depends entirely on what may be available in the GitHub repo.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results in Table 2 and Table 4 are single point estimates with no confidence intervals or error bars reported.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests are reported for any of the comparative ranking results despite multiple comparative claims being made.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Percentage point improvements are reported throughout with baseline context (e.g., '+6.2 points P@1' for 3B+H over 3B, '+10.5 points over doc2query'), providing interpretable effect sizes.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The test set size of 5,353 queries is described but not justified; no power analysis or discussion of whether this is sufficient to detect meaningful differences is provided.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No variance, standard deviation, or repeated-run statistics are reported for ranking metrics; Table 3 reports latency percentiles (p5/p95) but not ranking metric variance.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Multiple baselines included: vanilla BM25, doc2query, QE-BM25, dense retriever alone, and listwise LLM rerankers (Qwen2.5-72B, DeepSeek-R1).",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Baselines include DeepSeek-R1 (2025) and Qwen2.5-72B-Instruct (2024), both state-of-the-art models at time of writing.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Direct ablation comparing models with and without hints at both 0.5B and 3B sizes (Table 2), plus comparison of different hint generation strategies in Appendix D.3 (feature augmentation vs query reformulation).",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Six evaluation metrics are reported: P@1, P@3, P@5, P@10, MAP, and MRR across all conditions.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Human evaluation by 10 expert annotators on 153 randomly sampled queries in a blind setting, with Cohen's kappa of 0.74 reported as inter-annotator agreement.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "The dataset is stratified into train/dev/test splits (Table 1) and results are reported on the held-out test set of 5,353 queries.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Table 8 provides performance breakdown across 16 product categories for the best model, with MRR ranging from 64.07 to 81.73.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Appendix Table 9 shows three failure categories of the baseline (brand recognition errors, feature interpretation errors, relevancy assessment errors) with concrete product-level examples.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "doc2query shows only 'modest improvement' over vanilla BM25, and listwise Qwen2.5-72B shows 'limited effectiveness' (35.31 P@1); these are explicitly noted and not buried.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Specific model versions are stated: Claude 3.5 Sonnet v2, Claude 3.7 Sonnet, gte-Qwen2-7B-instruct, qwen2.5-0.5B-instruct, qwen2.5-3B-instruct, qwen2.5-72B-instruct, DeepSeek-R1.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Full prompts are provided in Appendix E (Tables 12-15) for query generation, relevance annotation, hint generation, and listwise ranking, with complete text including all instructions.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Learning rate (1e-5), batch sizes (32 for 0.5B, 2 for 3B), FP16 precision, early stopping on validation, and training time (3-4 hours on L40s) are all reported in Appendix A.1.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "The pipeline architecture is described: parallel hint generation concurrent with retrieval, chunking strategy for listwise (5 chunks of 10 products), zero-shot prompting rationale, and the QE-BM25 averaging algorithm (Appendix C.1).",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section 3 documents the full preprocessing pipeline: subcategory filtering (≥300 products), Claude query generation, BM25 no-result filtering, dense retrieval for top-50 candidates, and filtering queries with 0 or ≥15 relevant products.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The Amazon Shopping Queries Dataset (KDD Cup 2022) is publicly available, and the generated superlative query dataset is linked at the GitHub repository.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 3 describes the full collection procedure: source dataset, subcategory extraction, Claude-based query generation (50 queries per subcategory), BM25 filtering, dense retrieval for candidate products, and stratified splitting by label distribution and parent category.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": true,
    279           "answer": false,
    280           "justification": "The 10 human annotators are described only as 'expert annotators' with no information on recruitment criteria, compensation, qualifications, or whether they are Amazon employees.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The complete pipeline from Amazon Shopping Queries Dataset → subcategory filtering → Claude query generation → BM25 no-result filtering → dense retrieval → LLM relevance labeling → stratified splitting is fully documented across Section 3.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Training cutoffs for Claude 3.5 Sonnet v2 or Claude 3.7 Sonnet are not stated; the Amazon Shopping Queries Dataset has been public since 2022 and LLM training data overlap is not discussed.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether the Amazon Shopping Queries Dataset or the derived superlative queries appeared in any model's training data.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "The Amazon Shopping Queries Dataset has been publicly available since 2022 and could have been included in LLM training corpora; this is not discussed or acknowledged.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": true,
    313           "answer": false,
    314           "justification": "No pre-registration is mentioned for the human evaluation study.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": true,
    319           "answer": false,
    320           "justification": "No IRB or ethics approval is mentioned for the human evaluation involving the 10 annotators.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": true,
    325           "answer": false,
    326           "justification": "Annotators are described only as 'ten expert annotators' with no demographic information, background, or domain expertise reported.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": true,
    331           "answer": false,
    332           "justification": "No inclusion or exclusion criteria are stated for the annotators; the basis for calling them 'expert' is not defined.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": true,
    337           "answer": true,
    338           "justification": "The 153 evaluation queries are described as 'randomly sampled' from the test set, providing adequate description of the sampling randomization.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": true,
    343           "answer": true,
    344           "justification": "Blinding is explicitly described: 'evaluation was conducted in a blind setting, with annotators unaware of which system was which.'",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": true,
    349           "answer": false,
    350           "justification": "No attrition or dropout information is reported for the 10 annotators.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "End-to-end latency per query is reported in Table 3 (seconds, with p5/p95 percentiles), PFLOPs are reported in Figure 2, and AWS GPU hourly costs are stated ($30.13/hr for L40s, $27.45/hr for A100).",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": true,
    364           "justification": "Training time is specified as 3-4 hours on a single NVIDIA L40s GPU; with the stated hourly cost this is estimable, and the GPU types/counts for all experiments are described in Appendix A.1.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Hint-augmented QE-BM25 improves MAP by 10.9 points over vanilla BM25 (35.74 vs 24.79)",
    373       "evidence": "Table 2 reports MAP of 35.74 for QE-BM25 vs 24.79 for BM25 on the 5,353-query test set",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Small hint-augmented 3B model (64.39 P@1) surpasses 72B listwise ranker (35.31 P@1)",
    378       "evidence": "Table 2 shows qwen2.5-3B-instruct P+H achieves 64.39 P@1 vs qwen2.5-72B-instr listwise at 35.31 P@1",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Hint generation introduces only 3.5-6.8% latency overhead via parallel execution with retrieval",
    383       "evidence": "Table 3 shows 149ms overhead for 0.5B+H and 473ms for 3B+H over 3.688s dense retrieval baseline",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Hint-augmented models achieve 14x efficiency advantage over listwise LLMs",
    388       "evidence": "Appendix D.2 reports 0.122-0.224 PFLOPs for hint models vs 1.200-1.720 PFLOPs for listwise baselines",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Human evaluation confirms a 65.38% win rate for hint-augmented model (excluding ties)",
    393       "evidence": "153 queries assessed by 10 annotators: 51 wins for hint model, 27 for baseline, 75 ties; kappa=0.74",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Relevance label generation using Claude does not introduce judge-specific bias",
    398       "evidence": "Amazon Nova Pro replication (Table 4) shows consistent relative performance patterns, though absolute scores differ substantially",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "LLM-generated superlative query dataset has 94.6% validity as judged by human annotators",
    403       "evidence": "946/1000 randomly selected queries judged as natural and representative by a single annotator (Appendix B.2)",
    404       "supported": "weak"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval",
    409     "case-study"
    410   ],
    411   "key_findings": "The paper demonstrates that LLM-generated semantic hints (brands, features, and query reformulations) extracted from superlative queries can be transferred offline to small language models, enabling 3B parameter models to outperform 72B listwise rankers (64.39 vs 35.31 P@1) at 14x lower computational cost. Parallel hint generation introduces only 3.5-6.8% latency overhead over dense retrieval. Human evaluation by 10 annotators (Cohen's kappa of 0.74) on 153 queries gives the hint-augmented system a 65.38% win rate excluding ties (51 wins, 27 losses, 75 ties). The approach is specific to English Amazon e-commerce superlative queries and relies on LLM-generated relevance labels, which create a circularity concern that is only partially mitigated by an alternative judge evaluation.",
    412   "red_flags": [
    413     {
    414       "flag": "Circular evaluation",
    415       "detail": "Relevance labels are generated by Claude 3.5 Sonnet v2, the same model used to generate hints; models trained on Claude-hint-enriched text are evaluated against Claude-generated labels, creating a circular measurement system that favors hint-augmented models."
    416     },
    417     {
    418       "flag": "Mixed-baseline abstract claims",
    419       "detail": "The abstract's '10.9 MAP and 5.9 MRR over baselines' conflates two different experiments with different baselines (retrieval QE-BM25 vs BM25 for MAP; reranking hint vs no-hint for MRR), creating a misleading impression of unified improvement."
    420     },
    421     {
    422       "flag": "No significance tests",
    423       "detail": "All performance comparisons are reported as point differences without statistical significance tests, making it impossible to assess whether improvements exceed noise on the 5,353 test queries."
    424     },
    425     {
    426       "flag": "Undisclosed annotator identity",
    427       "detail": "The 10 'expert annotators' for human evaluation are likely Amazon employees with knowledge of the Amazon product catalog, which could bias preference toward Amazon-brand-aware judgments; this is not disclosed."
    428     },
    429     {
    430       "flag": "Dataset quality validated by single annotator",
    431       "detail": "The 94.6% query validity claim (Appendix B.2) is based on evaluation by a single annotator on 1,000 of 21,407 queries, with no inter-annotator agreement reported for the quality evaluation itself."
    432     },
    433     {
    434       "flag": "Amazon conflict of interest undisclosed",
    435       "detail": "All authors are Amazon employees evaluating a technique for Amazon search using Amazon's dataset; no competing interests statement is included despite direct commercial relevance."
    436     }
    437   ],
    438   "cited_papers": [
    439     {
    440       "title": "Is ChatGPT good at search? Investigating large language models as re-ranking agents",
    441       "relevance": "Establishes LLMs as capable zero-shot rerankers; foundational baseline for the LLM reranking approach"
    442     },
    443     {
    444       "title": "Large language models are effective text rankers with pairwise ranking prompting",
    445       "relevance": "Key baseline for pairwise/listwise LLM ranking; the chunked listwise approach in this paper addresses the O(N) inference cost noted here"
    446     },
    447     {
    448       "title": "Shopping Queries Dataset: A large-scale ESCI benchmark for improving product search",
    449       "relevance": "Source dataset for constructing the superlative query benchmark used in all experiments"
    450     },
    451     {
    452       "title": "Query expansion by prompting large language models",
    453       "relevance": "Direct baseline for LLM-based query expansion; this paper builds on and surpasses this approach specifically for superlative queries"
    454     },
    455     {
    456       "title": "Query2doc: Query expansion with large language models",
    457       "relevance": "Alternative query expansion baseline compared against in retrieval experiments"
    458     },
    459     {
    460       "title": "Superlatives in context: Modeling the implicit semantics of superlatives",
    461       "relevance": "Directly establishes that superlative interpretation remains challenging for advanced LLMs including GPT-4; motivates the problem"
    462     },
    463     {
    464       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    465       "relevance": "Methodology underpinning the CoT-based hint generation approach used to decompose superlatives into structured attributes"
    466     },
    467     {
    468       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    469       "relevance": "Key reasoning model baseline used as a strong listwise reranker comparison"
    470     },
    471     {
    472       "title": "Lost in the middle: How language models use long contexts",
    473       "relevance": "Motivates the chunking strategy for listwise reranking to avoid context length problems"
    474     },
    475     {
    476       "title": "Generative product recommendations for implicit superlative queries",
    477       "relevance": "Most closely related prior work on superlative query handling in e-commerce, with limitations that motivated this paper's dataset construction"
    478     }
    479   ],
    480   "engagement_factors": {
    481     "practical_relevance": {
    482       "score": 3,
    483       "justification": "Directly deployable in e-commerce search with demonstrated latency metrics, code release, and explicit addressing of production constraints including parallel execution and caching strategies."
    484     },
    485     "surprise_contrarian": {
    486       "score": 2,
    487       "justification": "The result that a 3B fine-tuned model with hints outperforms a 72B listwise ranker is counterintuitive and challenges the assumption that larger is always better for reasoning-intensive ranking."
    488     },
    489     "fear_safety": {
    490       "score": 0,
    491       "justification": "No AI risk or safety concerns raised; this is a product search optimization paper."
    492     },
    493     "drama_conflict": {
    494       "score": 1,
    495       "justification": "Mild tension with prior query expansion work (doc2query shown as limited for superlative queries), but no strong controversy or community conflict angle."
    496     },
    497     "demo_ability": {
    498       "score": 2,
    499       "justification": "Code and data released on GitHub, making the approach testable; however, the Amazon Shopping Queries Dataset requires agreement with Amazon's terms."
    500     },
    501     "brand_recognition": {
    502       "score": 2,
    503       "justification": "Amazon-affiliated paper with use of Claude (Anthropic) and evaluation on Amazon's own dataset; not a frontier lab publication but carries Amazon's brand weight in e-commerce search."
    504     }
    505   },
    506   "hn_data": {
    507     "threads": [
    508       {
    509         "hn_id": "25277281",
    510         "title": "Who Is Debugging the Debuggers? Exposing Debug Bugs in Optimized Binaries",
    511         "points": 98,
    512         "comments": 22,
    513         "url": "https://news.ycombinator.com/item?id=25277281",
    514         "created_at": "2020-12-02T15:38:48Z"
    515       },
    516       {
    517         "hn_id": "46246586",
    518         "title": "Can You Learn to See Without Images? Procedural Warm-Up for Vision Transformers",
    519         "points": 2,
    520         "comments": 1,
    521         "url": "https://news.ycombinator.com/item?id=46246586",
    522         "created_at": "2025-12-12T17:50:09Z"
    523       },
    524       {
    525         "hn_id": "42916126",
    526         "title": "Fanar: An Arabic-Centric Multimodal Generative AI Platform",
    527         "points": 2,
    528         "comments": 0,
    529         "url": "https://news.ycombinator.com/item?id=42916126",
    530         "created_at": "2025-02-03T08:28:03Z"
    531       },
    532       {
    533         "hn_id": "25283677",
    534         "title": "Who’s Debugging the Debuggers? Exposing Bugs in Debug Information",
    535         "points": 1,
    536         "comments": 1,
    537         "url": "https://news.ycombinator.com/item?id=25283677",
    538         "created_at": "2020-12-03T00:21:15Z"
    539       }
    540     ],
    541     "top_points": 98,
    542     "total_points": 103,
    543     "total_comments": 24
    544   }
    545 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs