scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29321B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "A Library of LLM Intrinsics for Retrieval-Augmented Generation",
      6     "authors": [
      7       "Marina Danilevsky",
      8       "Kristjan H. Greenewald",
      9       "Chulaka Gunasekara",
     10       "Maeda F Hanafi",
     11       "Lihong He",
     12       "Yannis Katsis",
     13       "Krishnateja Killamsetty",
     14       "Yulong Li",
     15       "Yatin Nandwani",
     16       "Lucian Popa",
     17       "Dinesh Raghu",
     18       "Frederick Reiss",
     19       "Vraj Shah",
     20       "Khoi-Nguyen Tran",
     21       "Huaiyu Zhu",
     22       "Luis Lastras"
     23     ],
     24     "year": 2025,
     25     "venue": "arXiv.org",
     26     "arxiv_id": "2504.11704",
     27     "doi": "10.48550/arXiv.2504.11704"
     28   },
     29   "checklist": {
     30     "claims_and_evidence": {
     31       "abstract_claims_supported": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The abstract claims the library is released as LoRA adapters on HuggingFace with a software interface on vLLM; the paper substantiates this with HuggingFace links, GitHub references, and evaluations for all 8 intrinsics.",
     35         "source": "haiku"
     36       },
     37       "causal_claims_justified": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Causal claims like 'query rewrite improves retrieval recall' are tested via controlled experiments holding the retriever fixed and varying only the query strategy, with multiple benchmarks and baseline comparisons.",
     41         "source": "haiku"
     42       },
     43       "generalization_bounded": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper proposes a general 'software library' concept for LLMs but all implementations are specific to IBM Granite 3.3-8b models; the conclusion does not bound the generalizability of results to other model families or retrieval backends.",
     47         "source": "haiku"
     48       },
     49       "alternative_explanations_discussed": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper does not discuss whether performance gains stem from LoRA fine-tuning specifically, the training data quality, or simply having a purpose-built model vs. prompting; no alternative explanations for the observed improvements are entertained.",
     53         "source": "haiku"
     54       },
     55       "proxy_outcome_distinction": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Metrics are clearly labeled to match what is claimed: Recall@k for retrieval quality, RAGAS Faithfulness for answer faithfulness, and precision/recall/F1 for classification tasks; the paper does not conflate these with higher-order outcomes like user satisfaction.",
     59         "source": "haiku"
     60       }
     61     },
     62     "limitations_and_scope": {
     63       "limitations_section_present": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "There is no dedicated limitations or threats-to-validity section; the paper ends with a conclusion section that does not discuss failure modes or scope restrictions.",
     67         "source": "haiku"
     68       },
     69       "threats_to_validity_specific": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No specific threats are discussed; potential issues like train-test overlap (Context Relevance trained and evaluated on CLAPNQ), self-evaluation bias from IBM evaluating IBM models, or RAGAS-F as an imperfect proxy are not mentioned.",
     73         "source": "haiku"
     74       },
     75       "scope_boundaries_stated": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The paper does not explicitly state what the results do NOT show; for instance, it does not clarify that performance claims are specific to IBM Granite models or that the 'intrinsic' framework has not been validated across other LLM providers.",
     79         "source": "haiku"
     80       }
     81     },
     82     "conflicts_of_interest": {
     83       "funding_disclosed": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding disclosure appears anywhere in the paper; the acknowledgments section only thanks annotators.",
     87         "source": "haiku"
     88       },
     89       "affiliations_disclosed": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "All 16 authors are explicitly listed as IBM Research, and the paper is clearly labeled 'IBM Research AI' throughout.",
     93         "source": "haiku"
     94       },
     95       "funder_independent_of_outcome": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "IBM employees are evaluating IBM Granite models and proposing IBM's Granite IO framework as the recommended interface; the funder/employer has a direct commercial interest in the favorable outcome.",
     99         "source": "haiku"
    100       },
    101       "financial_interests_declared": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is included in the paper.",
    105         "source": "haiku"
    106       }
    107     },
    108     "scope_and_framing": {
    109       "key_terms_defined": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The core concept 'LLM intrinsic' is explicitly defined in the introduction as 'a capability that can be invoked through a well-defined API that is reasonably stable and independent of how the LLM intrinsic itself is implemented.'",
    113         "source": "haiku"
    114       },
    115       "intended_contribution_clear": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper clearly states its contribution is a library of 8 RAG intrinsics implemented as LoRA adapters released on HuggingFace and through Granite IO, with training details and evaluations for each.",
    119         "source": "haiku"
    120       },
    121       "engagement_with_prior_work": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper references and builds on prior work including MT-RAG, RAGTruth, ALCE, LongBench-Cite, BEIR, and Thermometer calibration, situating each intrinsic relative to existing methods it outperforms or complements.",
    125         "source": "haiku"
    126       }
    127     }
    128   },
    129   "type_checklist": {
    130     "empirical": {
    131       "artifacts": {
    132         "code_released": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "LoRA adapters are released on HuggingFace (granite-3.3-8b-rag-agent-lib) and Granite IO software interface is released on GitHub (ibm-granite/granite-io) with example notebooks.",
    136           "source": "haiku"
    137         },
    138         "data_released": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "All evaluation datasets (MT-RAG, BEIR, MMLU, SQuAD, CLAPNQ, etc.) are standard public benchmarks used unmodified; training data for query rewrite is proprietary but evaluation data is public.",
    142           "source": "haiku"
    143         },
    144         "environment_specified": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "The paper mentions vLLM as the inference platform and PEFT for fine-tuning but provides no requirements file, Dockerfile, or specific version numbers for the software environment.",
    148           "source": "haiku"
    149         },
    150         "reproduction_instructions": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "The paper redirects readers to external HuggingFace documentation and GitHub notebooks rather than providing step-by-step reproduction instructions within the paper itself.",
    154           "source": "haiku"
    155         }
    156       },
    157       "statistical_methodology": {
    158         "confidence_intervals_or_error_bars": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "All results in all tables are reported as point estimates only; no confidence intervals, error bars, or standard deviations are provided for any metric.",
    162           "source": "haiku"
    163         },
    164         "significance_tests": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No statistical significance tests are used for any of the comparative claims despite tables showing close performance differences (e.g., LoRA vs. Mixtral in Table 2).",
    168           "source": "haiku"
    169         },
    170         "effect_sizes_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "The paper reports percentage-point improvements throughout, e.g., '9 percentage points jump in Recall@20' and '22 percentage points improvement on non-standalone subset', with baselines for context.",
    174           "source": "haiku"
    175         },
    176         "sample_size_justified": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "The benchmark sizes (842 MT-RAG points, specific BEIR subsets) are inherited from existing benchmarks without any power analysis or justification for whether these are sufficient.",
    180           "source": "haiku"
    181         },
    182         "variance_reported": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "No variance, standard deviation, or results across multiple runs are reported; all metrics appear to be single-run point estimates.",
    186           "source": "haiku"
    187         }
    188       },
    189       "evaluation_design": {
    190         "baselines_included": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Every intrinsic is compared against multiple baselines: no-rewrite/vanilla Granite instruct, Mixtral, Llama-3.3-70B, Granite Guardian, and sometimes gold human references.",
    194           "source": "haiku"
    195         },
    196         "baselines_contemporary": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Baselines include Llama-3.3-70B-Instruct, Mixtral-8x22B-Instruct, and Granite 3.3-8b-instruct, all current competitive models at time of writing.",
    200           "source": "haiku"
    201         },
    202         "ablation_study": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Section 10 systematically evaluates four composite flows (None, QR only, AD only, QR+AD) measuring the contribution of each component to answerability classification, faithfulness, and JAFS.",
    206           "source": "haiku"
    207         },
    208         "multiple_metrics": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "The paper uses Recall@k, RAGAS Faithfulness, RAD-Bench, NDCG@10, precision/recall/F1 by class, ECE, MAE, and the joint JAFS metric across different intrinsics.",
    212           "source": "haiku"
    213         },
    214         "human_evaluation": {
    215           "applies": true,
    216           "answer": false,
    217           "justification": "No human evaluation of system outputs is conducted; gold rewrites are used as reference baselines but do not constitute human evaluation of the intrinsic outputs.",
    218           "source": "haiku"
    219         },
    220         "held_out_test_set": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Evaluation is performed on held-out test splits of established benchmarks (MMLU, SQuAD, MT-RAG eval split, BEIR test sets) separate from training data.",
    224           "source": "haiku"
    225         },
    226         "per_category_breakdown": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Context Relevance results are broken down by 8 individual datasets (Tables 10-11); Passage Reranking results show per-task NDCG@10 across 15 BEIR subtasks (Table 15); UQ ECE is shown per MMLU task (Figure 1).",
    230           "source": "haiku"
    231         },
    232         "failure_cases_discussed": {
    233           "applies": true,
    234           "answer": false,
    235           "justification": "The paper notes potentially confusing composite behaviors (high UQ + low HD scores) conceptually but does not show or analyze any actual failure cases or error examples.",
    236           "source": "haiku"
    237         },
    238         "negative_results_reported": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Table 15 shows Passage Reranking hurts performance on several BEIR tasks (touche2020, scidocs, arguana, quora, fever, climatefever); the paper acknowledges this and offers a hypothesis about annotation quality.",
    242           "source": "haiku"
    243         }
    244       },
    245       "setup_transparency": {
    246         "model_versions_specified": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Exact model versions are specified: ibm-granite/granite-3.3-8b-instruct, meta-llama/Llama-3.3-70B-Instruct, Mixtral 8x7b, ibm-granite/granite-guardian-3.1-5b, etc.",
    250           "source": "haiku"
    251         },
    252         "prompts_provided": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Actual prompts are provided for several intrinsics including the full rewrite role prompt (Section 2.1), the passage reranking default prompt (Section 6.1), and hallucination detection instruction string (Section 8.1).",
    256           "source": "haiku"
    257         },
    258         "hyperparameters_reported": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "LoRA hyperparameters are reported for each adapter: rank (8, 16, or 32), learning rate (5e-6 to 1e-5), number of epochs (25), and train/validation split (90/10).",
    262           "source": "haiku"
    263         },
    264         "scaffolding_described": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "Section 10 and Figure 2 describe the composite intrinsic flows in detail including how QR and AD chain together, what triggers the 'I don't know' response, and how retrieved passages flow between components.",
    268           "source": "haiku"
    269         },
    270         "data_preprocessing_documented": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Preprocessing steps are documented: NLTK sentence splitting for citation/hallucination tasks, Mixtral as automatic judge for label validation, apply_chat_template for model inputs, and 90/10 train/val splits.",
    274           "source": "haiku"
    275         }
    276       },
    277       "data_integrity": {
    278         "raw_data_available": {
    279           "applies": true,
    280           "answer": false,
    281           "justification": "Only aggregate metrics are reported; individual model predictions, intermediate outputs, or per-example results are not released for independent verification.",
    282           "source": "haiku"
    283         },
    284         "data_collection_described": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Training data construction is described for each adapter (e.g., human-annotated Cloud corpus for QR, synthetically generated via Mixtral for context relevance, CoQA/MultiDoc2Dial/QuAC as seed corpora for HD and CG).",
    288           "source": "haiku"
    289         },
    290         "recruitment_methods_described": {
    291           "applies": false,
    292           "answer": false,
    293           "justification": "Standard public benchmarks are used for evaluation; no participant recruitment was involved.",
    294           "source": "haiku"
    295         },
    296         "data_pipeline_documented": {
    297           "applies": true,
    298           "answer": true,
    299           "justification": "The multi-step data pipeline is described for training data: corpus selection, conversation generation, label generation (via Mixtral judge), filtering, LoRA fine-tuning with PEFT.",
    300           "source": "haiku"
    301         }
    302       },
    303       "contamination": {
    304         "training_cutoff_stated": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "The base model (Granite 3.3-8b-instruct) training data cutoff is not stated, making it impossible to assess whether evaluation benchmarks were in the pre-training data.",
    308           "source": "haiku"
    309         },
    310         "train_test_overlap_discussed": {
    311           "applies": true,
    312           "answer": false,
    313           "justification": "Context Relevance is trained on CLAPNQ data and evaluated on CLAPNQ; Answerability Determination is trained on the MT-RAG Government corpus and evaluated on MT-RAG benchmark; this overlap is never discussed.",
    314           "source": "haiku"
    315         },
    316         "benchmark_contamination_addressed": {
    317           "applies": true,
    318           "answer": false,
    319           "justification": "No discussion of whether the base Granite model's pre-training included evaluation benchmark examples (MMLU, SQuAD, BEIR datasets are widely used and likely present in pre-training data).",
    320           "source": "haiku"
    321         }
    322       },
    323       "human_studies": {
    324         "pre_registered": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "haiku"
    329         },
    330         "irb_or_ethics_approval": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "haiku"
    335         },
    336         "demographics_reported": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "haiku"
    341         },
    342         "inclusion_exclusion_criteria": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "haiku"
    347         },
    348         "randomization_described": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "haiku"
    353         },
    354         "blinding_described": {
    355           "applies": false,
    356           "answer": false,
    357           "justification": "No human participants in this study.",
    358           "source": "haiku"
    359         },
    360         "attrition_reported": {
    361           "applies": false,
    362           "answer": false,
    363           "justification": "No human participants in this study.",
    364           "source": "haiku"
    365         }
    366       },
    367       "cost_and_practicality": {
    368         "inference_cost_reported": {
    369           "applies": true,
    370           "answer": false,
    371           "justification": "A qualitative note about passage reranking latency ('only needs 1 preference token') is provided but no systematic latency or cost benchmarks are reported for any intrinsic.",
    372           "source": "haiku"
    373         },
    374         "compute_budget_stated": {
    375           "applies": true,
    376           "answer": false,
    377           "justification": "No GPU hours, compute budget, or training time is reported for any of the LoRA fine-tuning runs.",
    378           "source": "haiku"
    379         }
    380       }
    381     }
    382   },
    383   "claims": [
    384     {
    385       "claim": "Query Rewrite LoRA adapter achieves 9pp improvement in Recall@20 over no-rewrite on the full MT-RAG dataset (22pp improvement on the non-standalone subset).",
    386       "evidence": "Tables 2-4 compare Recall@5/10/20 across no-rewrite, Mixtral, Granite LoRA, and gold rewrite strategies on full, non-standalone, and standalone MT-RAG subsets.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Citation Generation LoRA (8b) achieves comparable or better citation F1 than prompted Llama-3.1-70B-Instruct (70b) on ALCE and significantly outperforms it on LongBench-Cite (68.6 vs 42.0 avg F1).",
    391       "evidence": "Tables 18-19 compare Recall/Precision/F1 for the LoRA adapter vs. much larger models on two citation benchmarks; LongBench-Cite shows a 26.6pp avg F1 advantage.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Context Relevance LoRA improves precision on relevant labels across all 8 evaluated datasets vs. Llama-3.3-70B, Granite Guardian, and vanilla Granite instruct.",
    396       "evidence": "Table 10 shows LoRA precision consistently at or above 0.92 across all datasets while Granite Guardian drops as low as 0.08 and vanilla Granite instruct as low as 0.60.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "Uncertainty Quantification LoRA achieves an average ECE of 0.064 across MMLU tasks without degrading zero-shot accuracy (avg 89%), compared to base model ECE of 0.20.",
    401       "evidence": "Section 7.2 and Figure 1 report ECE per MMLU task for the LoRA vs. base Granite 3.3-8b-instruct; the LoRA ECE is smaller than the 10% quantization gap.",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "Composite QR+AD flow achieves better overall performance (JAFS=66) than either QR alone (56) or no intrinsic (49), matching AD alone (66) while also improving faithfulness.",
    406       "evidence": "Tables 20-22 decompose answerability classification F1, RAGAS faithfulness, and JAFS for all four flows; QR+AD matches AD on JAFS while improving answerable query handling.",
    407       "supported": "moderate"
    408     },
    409     {
    410       "claim": "LLM intrinsics can be implemented as a stable, model-agnostic API analogous to a software library, enabling modular RAG pipelines independent of backend implementation.",
    411       "evidence": "Conceptual argument in Sections 1 and 11; the claim is not empirically tested across multiple LLM providers — all implementations use IBM Granite models only.",
    412       "supported": "weak"
    413     }
    414   ],
    415   "methodology_tags": [
    416     "benchmark-eval",
    417     "case-study"
    418   ],
    419   "key_findings": "The paper introduces 8 RAG intrinsics as LoRA adapters for IBM Granite 3.3-8b, released on HuggingFace under Apache 2.0. Fine-tuned small LoRA adapters (8b) consistently match or outperform much larger prompted models (70b) on RAG-specific tasks including citation generation, context relevance, and answerability determination, with Citation Generation achieving 68.6 vs. 42.0 avg F1 on LongBench-Cite against prompted Llama-3.1-70B. The composite intrinsic evaluation demonstrates that simple accuracy metrics can be misleading for composite flows — the joint answerability-faithfulness score (JAFS) is needed to see that QR+AD (66) outperforms QR alone (56) even when RAGAS-F alone suggests QR is superior. Notable contamination concern: several adapters are trained and evaluated on overlapping data (CLAPNQ, MT-RAG) without discussion.",
    420   "red_flags": [
    421     {
    422       "flag": "Train-test contamination undisclosed",
    423       "detail": "Context Relevance is trained on CLAPNQ (Section 4.3) and evaluated on CLAPNQ (Tables 10-11). Answerability Determination is trained on the MT-RAG Government corpus (Section 5.3) and evaluated on the MT-RAG Benchmark (Table 13). This overlap is never acknowledged or discussed."
    424     },
    425     {
    426       "flag": "Self-evaluation: IBM evaluates IBM products",
    427       "detail": "All 16 authors are IBM Research employees evaluating IBM Granite models with IBM Granite IO as the recommended deployment framework, with no independent third-party evaluation and no funding disclosure."
    428     },
    429     {
    430       "flag": "No confidence intervals or significance tests",
    431       "detail": "All performance numbers are point estimates across tables; differences of 1-2 percentage points are discussed as meaningful improvements without any statistical testing."
    432     },
    433     {
    434       "flag": "No limitations section",
    435       "detail": "The paper has no dedicated limitations discussion; failure modes, scope restrictions, or cases where the intrinsics would not be expected to work are not addressed."
    436     },
    437     {
    438       "flag": "Proprietary training data for key intrinsic",
    439       "detail": "Query Rewrite training data is described as proprietary, obtained through a third-party annotation company (Section 2.3), preventing reproduction or independent assessment of data quality."
    440     }
    441   ],
    442   "cited_papers": [
    443     {
    444       "title": "MTRAG: A Multi-Turn Conversational Benchmark for Evaluating Retrieval-Augmented Generation Systems",
    445       "relevance": "Primary evaluation benchmark used across multiple intrinsics; provides MT-RAG dataset for QR, AD, and composite flow evaluation"
    446     },
    447     {
    448       "title": "Enabling Large Language Models to Generate Text with Citations (ALCE)",
    449       "relevance": "Citation generation benchmark used to evaluate the Citation Generation intrinsic against prompted larger models"
    450     },
    451     {
    452       "title": "LongCite: Enabling LLMs to Generate Fine-Grained Citations in Long-Context QA (LongBench-Cite)",
    453       "relevance": "Second citation benchmark showing LoRA significantly outperforms prompted 70B models on span-level citations"
    454     },
    455     {
    456       "title": "RAGTruth: A Hallucination Corpus for Developing Trustworthy Retrieval-Augmented Language Models",
    457       "relevance": "Benchmark used to evaluate Hallucination Detection intrinsic; provides baseline results from the original paper"
    458     },
    459     {
    460       "title": "BEIR: A Heterogeneous Benchmark for Zero-Shot Evaluation of Information Retrieval Models",
    461       "relevance": "Used to evaluate Passage Reranking intrinsic across 15 retrieval tasks, measured by NDCG@10"
    462     },
    463     {
    464       "title": "Thermometer: Towards Universal Calibration for Large Language Models",
    465       "relevance": "The calibration method that UQ LoRA is trained to mimic; foundational methodology for the Uncertainty Quantification intrinsic"
    466     },
    467     {
    468       "title": "CLAPnq: Cohesive Long-Form Answers from Passages in Natural Questions for RAG Systems",
    469       "relevance": "Dataset used for both training and evaluation of Context Relevance intrinsic (raises contamination concern)"
    470     },
    471     {
    472       "title": "Activated LoRA: Fine-tuned LLMs for Intrinsics",
    473       "relevance": "Sister paper describing the activated LoRA mechanism as an efficient implementation approach for LLM intrinsics"
    474     },
    475     {
    476       "title": "Know What You Don't Know: Unanswerable Questions for SQuAD (SQuADRun)",
    477       "relevance": "Benchmark for evaluating Answerability Determination in single-turn setting"
    478     }
    479   ],
    480   "engagement_factors": {
    481     "practical_relevance": {
    482       "score": 3,
    483       "justification": "LoRA adapters and Granite IO are publicly released under Apache 2.0; practitioners can immediately deploy these 8 RAG intrinsics via HuggingFace or GitHub notebooks."
    484     },
    485     "surprise_contrarian": {
    486       "score": 1,
    487       "justification": "Fine-tuned small models outperforming prompted large models is a well-established finding; no unexpected or counter-intuitive results are presented."
    488     },
    489     "fear_safety": {
    490       "score": 1,
    491       "justification": "The hallucination detection and answerability determination intrinsics address reliability concerns, but no AI risk framing is used."
    492     },
    493     "drama_conflict": {
    494       "score": 0,
    495       "justification": "No controversy, no challenge to dominant paradigms, no conflict with other researchers."
    496     },
    497     "demo_ability": {
    498       "score": 3,
    499       "justification": "Code, models, and notebooks are publicly available on HuggingFace and GitHub; anyone can run the intrinsics immediately with vLLM and Granite IO."
    500     },
    501     "brand_recognition": {
    502       "score": 2,
    503       "justification": "IBM Research is a recognizable AI lab and IBM Granite is a known open model family, though neither is at the same recognition level as OpenAI, Google, or Meta."
    504     }
    505   },
    506   "hn_data": {
    507     "threads": [
    508       {
    509         "hn_id": "44770561",
    510         "title": "B-Splines and Fourier-Best Friends for Spatial-Temporal Video Super-Resolution",
    511         "points": 4,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=44770561"
    514       },
    515       {
    516         "hn_id": "43872030",
    517         "title": "Linguistics Learned to Stop Worrying and Love the Language Models",
    518         "points": 3,
    519         "comments": 1,
    520         "url": "https://news.ycombinator.com/item?id=43872030"
    521       },
    522       {
    523         "hn_id": "40106382",
    524         "title": "Automated Social Science: Language Models as Scientist and Subjects",
    525         "points": 3,
    526         "comments": 1,
    527         "url": "https://news.ycombinator.com/item?id=40106382"
    528       },
    529       {
    530         "hn_id": "43773523",
    531         "title": "Robotic Squirrel Pinto: A latched spring actuated robot for jumping and perching",
    532         "points": 2,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=43773523"
    535       },
    536       {
    537         "hn_id": "45259423",
    538         "title": "Human+AI loops stay stable even with quantization",
    539         "points": 2,
    540         "comments": 1,
    541         "url": "https://news.ycombinator.com/item?id=45259423"
    542       },
    543       {
    544         "hn_id": "46145768",
    545         "title": "Conversational Networks",
    546         "points": 2,
    547         "comments": 0,
    548         "url": "https://news.ycombinator.com/item?id=46145768"
    549       },
    550       {
    551         "hn_id": "44739937",
    552         "title": "Double Duty: FPGA Architecture to Enable Concurrent LUT and Adder Chain Usage",
    553         "points": 2,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=44739937"
    556       },
    557       {
    558         "hn_id": "44375110",
    559         "title": "Gender and Positional Biases in LLM-Based Hiring Decisions: Evidence Found",
    560         "points": 2,
    561         "comments": 0,
    562         "url": "https://news.ycombinator.com/item?id=44375110"
    563       },
    564       {
    565         "hn_id": "44242780",
    566         "title": "Talk to Your Slides: Efficient Slide Editing Agent with Large Language Models",
    567         "points": 1,
    568         "comments": 0,
    569         "url": "https://news.ycombinator.com/item?id=44242780"
    570       },
    571       {
    572         "hn_id": "43843140",
    573         "title": "Physical Principles of Quantum Biology",
    574         "points": 1,
    575         "comments": 0,
    576         "url": "https://news.ycombinator.com/item?id=43843140"
    577       }
    578     ],
    579     "top_points": 4,
    580     "total_points": 22,
    581     "total_comments": 3
    582   }
    583 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs