scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28762B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Gorilla: Large Language Model Connected with Massive APIs",
      6     "authors": [
      7       "Shishir G. Patil",
      8       "Tianjun Zhang",
      9       "Xin Wang",
     10       "Joseph E. Gonzalez"
     11     ],
     12     "year": 2023,
     13     "venue": "Neural Information Processing Systems",
     14     "arxiv_id": "2305.15334",
     15     "doi": "10.52202/079017-4020"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract claims Gorilla surpasses GPT-4 on API calls and reduces hallucination; Table 1 supports this numerically (e.g., 59.13% vs 38.70% zero-shot TorchHub accuracy, 6.98% vs 36.55% hallucination rate).",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Claims that fine-tuning improves accuracy and retriever-aware training enables adaptation are tested through direct ablations comparing Gorilla with/without retrieval and against baseline models on the same benchmark.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper tests only ML model hub APIs but makes broader claims about LLMs using 'tools' and mentions RESTful APIs as future scope without empirical support; the title 'Massive APIs' implies broader applicability than the ML-only evaluation supports.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not discuss that Gorilla's advantage may stem from memorizing specific API signatures from its training split rather than genuinely learning tool use, nor does it address whether GPT-4 is disadvantaged by never seeing the specific model hub API format.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly defines 'hallucination' as an API call not found in the dataset and 'accuracy' via AST subtree matching, clearly distinguishing the measurement from broader claims about tool use reliability.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 6 is titled 'Limitations & Social Impacts' and includes domain-specific limitations discussion.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The limitations section only mentions potential bias from ML-domain APIs and does not address two key threats: that the test set is derived from the same GPT-4-generated instruction pool as the training set, and that the HuggingFace evaluation uses different metrics for Gorilla vs baselines.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper notes ML APIs were chosen for challenge but does not explicitly bound claims to that domain; the conclusion and abstract imply generalizability to broader tool use without stating what the results do NOT show.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Acknowledgment section explicitly lists funders: 'gifts to UC Berkeley Sky Computing Lab from Astronomer, Google, IBM, Intel, Lacework, Microsoft, Nexla, Samsung SDS, Uber, and VMware.'",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly stated: UC Berkeley (Patil, Zhang, Gonzalez) and Microsoft Research (Wang).",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Microsoft is both a funder (listed among the gift donors to the UC Berkeley Sky Computing Lab) and the employer of co-author Xin Wang (Microsoft Research); the funder is thus not independent of the outcome even though the evaluated model is LLaMA-based rather than a Microsoft product.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "There is no competing interests statement, no disclosure of patents, equity, or consulting arrangements; only funding acknowledgment is provided.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "'Hallucination' is precisely defined as an API call not matching any subtree in the dataset; 'accuracy' is defined via AST subtree matching; retriever variants are all defined operationally.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper clearly states it contributes (1) Gorilla, a fine-tuned LLaMA model for API calls, and (2) APIBench, a benchmark of ML model hub APIs with self-instruct instruction-API pairs.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 positions Gorilla against Toolformer, GPT-4 plugins, HuggingGPT, TaskMatrix.AI, and prior tool-augmented LLM work, explaining how the systematic evaluation and fine-tuning approach differs.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "The abstract explicitly states 'Gorilla's code, model, data, and demo are available at https://gorilla.cs.berkeley.edu'.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The APIBench dataset (1,645 API calls, 16,450 instruction-API pairs) is released publicly at the project website.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "The paper specifies training hardware (8xA100 40G) and hyperparameters but provides no requirements.txt, Dockerfile, or complete dependency list for reproduction.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Training hyperparameters and dataset construction are described but no step-by-step reproduction instructions are provided in the paper; a website is referenced but no structured guide is given.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results in Tables 1-3 and figures are single point estimates with no confidence intervals, error bars, or standard deviations across runs.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are applied to any of the comparative claims; all comparisons are made by inspecting raw percentage differences.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Absolute percentage improvements are reported with baseline context (e.g., '20.43% better than GPT-4', '83% improvement over LLaMA'), which constitutes effect size reporting even without confidence intervals.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "Sample sizes reflect available APIs (94 TorchHub, 626 TensorHub, 925 HuggingFace) with no power analysis or justification for whether these sizes are adequate to detect differences.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No variance, standard deviation, or multi-run statistics are reported; all results appear to be single evaluation runs.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "GPT-4, GPT-3.5-turbo, Claude, and LLaMA-7B are all included as baselines across all retriever settings.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Baselines include GPT-4 (gpt-4-0314), GPT-3.5-turbo (gpt-3.5-turbo-0301), and Claude (claude-v1) — the strongest available models at the time of publication.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Table 2 provides an ablation comparing Gorilla trained with vs without retriever across all retrieval settings at test time, demonstrating the effect of retriever-aware training.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Three metrics are reported: overall accuracy, hallucination error rate, and wrong-API error rate; constraint-aware accuracy is additionally evaluated in Table 3.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "No human evaluation of outputs is conducted; this is a benchmark-driven evaluation using AST matching, which is appropriate for this task type.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "The paper states 'we have maintained a holdout test set on which we report our findings,' with 80/20 train/test splits for TorchHub and TensorHub, and 90/10 for HuggingFace.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down separately for TorchHub, HuggingFace, and TensorHub across all retrieval conditions; constraint-aware API calls are also analyzed separately.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Appendix Section 8.3.2 and Figure 9 show specific hallucination examples from GPT-4; the paper discusses patterns of failure including arbitrary GitHub repository names and 'your_model_name' placeholders.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper explicitly reports that 'augmenting a LLM with retrieval, does not always lead to improved performance' and that non-optimal retrievers cause large accuracy drops (e.g., 52.27% degradation on HuggingFace with BM25).",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Exact checkpoint versions are specified: gpt-4-0314, gpt-3.5-turbo-0301, claude-v1, and LLaMA-7B; Gorilla is described as fine-tuned from LLaMA-7B.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Figure 8 shows full example prompts including zero-shot and retrieval-augmented formats; the instruction template structure is described and exemplified.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Table 4 explicitly lists learning rate (2e-5), batch size (64), epochs (5), warmup ratio (0.03), weight decay (0), and max sequence length (2048).",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The retrieval pipeline is described in detail — how BM25 and GPT-Index retrieve documents, the concatenation format ('<user_prompt> Use this API documentation for reference: <retrieved_API_doc_JSON>'), and Oracle retriever definition.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 3.1 documents filtering criteria for each hub (e.g., removing models with poor documentation, selecting top-20 per domain for HuggingFace), JSON field definitions, and self-instruct generation process.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "The dataset (1,645 API calls, 16,450 instruction-API pairs) is released at the project website and explicitly mentioned as publicly available.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 3.1 describes the scraping process for all three hubs, filtering criteria, JSON schema construction, and the GPT-4-based self-instruct generation process in sufficient detail.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; data is collected by scraping public model hub pages, not through participant recruitment.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The full pipeline from model card scraping to JSON objects to self-instruct generation to instruction-API pairs to train/test split is documented across Section 3 and the Appendix.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "LLaMA-7B's pre-training data cutoff is not stated; for GPT-4 and other baselines, training cutoffs are not mentioned despite API documentation being public and potentially included in pretraining.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "The test set is derived from the same GPT-4-generated instruction pool as the training data (split 80/20 or 90/10); this distributional overlap is not discussed as a threat to validity.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "The API documentation from public model hubs (HuggingFace, TorchHub, TensorHub) was publicly available before GPT-4's training cutoff; the possibility that baselines have seen this documentation in pretraining is not discussed.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in this study.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No inference latency or cost figures are reported for Gorilla or the baseline API calls (GPT-4, etc.) despite practical deployment being a stated motivation.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Training hardware is mentioned (8xA100 40G, 5 epochs) but total GPU-hours or dollar cost of training and evaluation are not stated.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Gorilla (fine-tuned LLaMA-7B) surpasses GPT-4 on API call accuracy in zero-shot settings across all three benchmarks",
    374       "evidence": "Table 1: Gorilla zero-shot scores 59.13% vs GPT-4's 38.70% on TorchHub; 71.68% vs 19.80% on HuggingFace; 83.79% vs 18.20% on TensorHub",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Gorilla substantially reduces hallucination errors compared to GPT-4 and other LLMs",
    379       "evidence": "Table 1: Gorilla hallucination rate 6.98% vs GPT-4's 36.55% zero-shot on TorchHub; 10.95% vs 37.16% on HuggingFace; 5.40% vs 78.65% on TensorHub",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Retriever-aware training enables Gorilla to adapt to test-time API documentation changes without retraining",
    384       "evidence": "Figure 6 shows qualitative examples of Gorilla correctly updating API calls when retrieved documentation changes from one model version/registry to another",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "Fine-tuning without a retriever outperforms prompting state-of-the-art LLMs with retrievers, and is 20.43% better than GPT-4",
    389       "evidence": "Table 1 zero-shot comparison shows Gorilla (59.13%) vs GPT-4 (38.70%) on TorchHub; claim of '20.43% better' is stated in text",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Adding a non-optimal retriever at test time to a model not trained with it hurts performance",
    394       "evidence": "Table 2: Gorilla trained without retriever drops from 59.13% (zero-shot) to 37.63% with BM25 on TorchHub, and falls to 11.28% with BM25 on HuggingFace",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Gorilla can reason about constraints (model size, accuracy thresholds) when selecting APIs",
    399       "evidence": "Table 3 shows Gorilla achieves 47.88% constraint accuracy in zero-shot vs GPT-4's 43.66%, but GPT-3.5 leads at 73.94%; constraint evaluation is on a filtered 65.26% subset of TorchHub",
    400       "supported": "weak"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval"
    405   ],
    406   "key_findings": "Gorilla, a LLaMA-7B model fine-tuned on synthetic instruction-API pairs from model hubs, substantially outperforms GPT-4 on API call generation accuracy while drastically reducing hallucination (6.98% vs 36.55% on TorchHub zero-shot). Retriever-aware training enables the model to adapt to test-time API documentation changes. A critical finding is that non-optimal retrievers can significantly hurt performance for models not trained with them, while retriever-aware training with oracle documents achieves the best absolute results. The HuggingFace comparison is methodologically unequal — competitors are evaluated on domain classification only while Gorilla is evaluated on full API accuracy.",
    407   "red_flags": [
    408     {
    409       "flag": "Unequal HuggingFace evaluation metric",
    410       "detail": "For HuggingFace, all baseline models (GPT-4, GPT-3.5, Claude, LLaMA) are only evaluated on domain classification (multi-choice), while Gorilla is evaluated on full API accuracy. This makes the HuggingFace comparison fundamentally invalid — Gorilla's 71.68% vs GPT-4's 19.80% are measuring different things."
    411     },
    412     {
    413       "flag": "Test/train distributional overlap",
    414       "detail": "The test set is split from the same GPT-4-generated instruction pool used for training (90/10 for HuggingFace, 80/20 for others). If GPT-4 generated both training and test instructions with similar patterns, fine-tuned Gorilla may be pattern-matching the instruction style rather than generalizing."
    415     },
    416     {
    417       "flag": "No statistical significance testing",
    418       "detail": "All comparisons are single point estimates with no confidence intervals, error bars, or significance tests. Small absolute differences (e.g., Gorilla 61.82% vs GPT-4 59.13% with GPT-retriever on TorchHub) are presented as meaningful without statistical validation."
    419     },
    420     {
    421       "flag": "Benchmark contamination unaddressed",
    422       "detail": "Public model hub documentation (HuggingFace, TorchHub, TensorHub) was available before GPT-4's training cutoff. The paper does not discuss whether GPT-4's pretraining includes this data, which would affect the fairness of zero-shot baseline comparisons."
    423     },
    424     {
    425       "flag": "Synthetic evaluation data from GPT-4",
    426       "detail": "Both training and test instructions were generated by GPT-4 via self-instruct. The evaluation therefore partly measures how well Gorilla learned GPT-4's instruction phrasings rather than how well it generalizes to real user queries."
    427     },
    428     {
    429       "flag": "Constraint evaluation on skewed subset",
    430       "detail": "Table 3 (constraint-aware API calls) is evaluated only on the 65.26% of TorchHub APIs that have accuracy information, which is a non-random subset and makes generalization claims about constraint handling uncertain."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    436       "relevance": "Foundational prior work on LLM tool use; Gorilla directly positions itself as addressing a gap that Toolformer leaves (systematic evaluation and scaling to massive APIs)"
    437     },
    438     {
    439       "title": "GPT-4 Technical Report",
    440       "relevance": "Primary baseline model; the claim that Gorilla outperforms GPT-4 is central to the paper's contribution"
    441     },
    442     {
    443       "title": "LLaMA: Open and Efficient Foundation Language Models",
    444       "relevance": "Base model that Gorilla is fine-tuned from; enables the open-source aspect of the contribution"
    445     },
    446     {
    447       "title": "Self-Instruct: Aligning Language Model with Self Generated Instructions",
    448       "relevance": "Core methodology used to generate the 16,450 instruction-API pairs in APIBench"
    449     },
    450     {
    451       "title": "TaskMatrix.AI: Completing Tasks by Connecting Foundation Models with Millions of APIs",
    452       "relevance": "Concurrent related work on large-scale API calling; paper distinguishes Gorilla by systematic training and evaluation"
    453     },
    454     {
    455       "title": "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace",
    456       "relevance": "Related work using LLMs to call HuggingFace models; Gorilla covers similar territory with a fine-tuning rather than prompting approach"
    457     },
    458     {
    459       "title": "WebGPT: Browser-Assisted Question-Answering with Human Feedback",
    460       "relevance": "Representative prior work augmenting LLMs with external tools; establishes the research context"
    461     },
    462     {
    463       "title": "Evaluating Large Language Models Trained on Code (HumanEval/Codex)",
    464       "relevance": "Prior work on LLM code evaluation; Gorilla's AST-based evaluation methodology relates to this line of work"
    465     }
    466   ],
    467   "engagement_factors": {
    468     "practical_relevance": {
    469       "score": 3,
    470       "justification": "Directly addresses a concrete failure mode (LLMs hallucinating API calls) that affects anyone building LLM-powered applications on top of model hubs."
    471     },
    472     "surprise_contrarian": {
    473       "score": 2,
    474       "justification": "A 7B parameter fine-tuned open model beating GPT-4 on a specific task challenges the conventional wisdom that scale dominates; also shows that retrieval sometimes hurts, not helps."
    475     },
    476     "fear_safety": {
    477       "score": 0,
    478       "justification": "No safety or risk concerns raised; the paper is a straightforward capability improvement paper."
    479     },
    480     "drama_conflict": {
    481       "score": 1,
    482       "justification": "The 'small model beats GPT-4' narrative has mild drama appeal but no controversy or conflict beyond normal academic competition."
    483     },
    484     "demo_ability": {
    485       "score": 3,
    486       "justification": "A live demo is explicitly available at gorilla.cs.berkeley.edu and the model, code, and data are all released for immediate use."
    487     },
    488     "brand_recognition": {
    489       "score": 2,
    490       "justification": "UC Berkeley (Sky Computing Lab) and Microsoft Research are both strong brands; the paper also benchmarks against GPT-4 and Claude which adds name recognition."
    491     }
    492   },
    493   "hn_data": {
    494     "threads": [
    495       {
    496         "hn_id": "36888948",
    497         "title": "Google Med-Palm M: Towards Generalist Biomedical AI",
    498         "points": 110,
    499         "comments": 87,
    500         "url": "https://news.ycombinator.com/item?id=36888948",
    501         "created_at": "2023-07-27T04:32:46Z"
    502       },
    503       {
    504         "hn_id": "39594687",
    505         "title": "Sigmoid Loss for Language Image Pre-Training (2023)",
    506         "points": 32,
    507         "comments": 0,
    508         "url": "https://news.ycombinator.com/item?id=39594687",
    509         "created_at": "2024-03-04T19:17:17Z"
    510       },
    511       {
    512         "hn_id": "41539236",
    513         "title": "Teaching Models to Express Their Uncertainty in Words (2022)",
    514         "points": 7,
    515         "comments": 0,
    516         "url": "https://news.ycombinator.com/item?id=41539236",
    517         "created_at": "2024-09-14T11:57:40Z"
    518       },
    519       {
    520         "hn_id": "36183062",
    521         "title": "Gorilla: Large Language Model Connected with APIs",
    522         "points": 5,
    523         "comments": 0,
    524         "url": "https://news.ycombinator.com/item?id=36183062",
    525         "created_at": "2023-06-04T04:19:26Z"
    526       },
    527       {
    528         "hn_id": "31609508",
    529         "title": "Attribution-Based Explanations That Provide Recourse Cannot Be Robust",
    530         "points": 3,
    531         "comments": 1,
    532         "url": "https://news.ycombinator.com/item?id=31609508",
    533         "created_at": "2022-06-03T15:20:31Z"
    534       },
    535       {
    536         "hn_id": "39271919",
    537         "title": "Augmenting Scholarly Documents Through AI-Powered Interactive Reading Interfaces",
    538         "points": 3,
    539         "comments": 0,
    540         "url": "https://news.ycombinator.com/item?id=39271919",
    541         "created_at": "2024-02-06T07:46:42Z"
    542       },
    543       {
    544         "hn_id": "36069263",
    545         "title": "Gorilla: Large Language Model Connected with APIs",
    546         "points": 2,
    547         "comments": 0,
    548         "url": "https://news.ycombinator.com/item?id=36069263",
    549         "created_at": "2023-05-25T11:13:58Z"
    550       },
    551       {
    552         "hn_id": "36111250",
    553         "title": "How Language Model Hallucinations Can Snowball",
    554         "points": 2,
    555         "comments": 1,
    556         "url": "https://news.ycombinator.com/item?id=36111250",
    557         "created_at": "2023-05-29T06:20:14Z"
    558       },
    559       {
    560         "hn_id": "36075589",
    561         "title": "Model Evaluation for Extreme Risks",
    562         "points": 2,
    563         "comments": 1,
    564         "url": "https://news.ycombinator.com/item?id=36075589",
    565         "created_at": "2023-05-25T20:16:30Z"
    566       },
    567       {
    568         "hn_id": "37299645",
    569         "title": "Objects in JWST's mirrors are closer than they appear",
    570         "points": 2,
    571         "comments": 0,
    572         "url": "https://news.ycombinator.com/item?id=37299645",
    573         "created_at": "2023-08-28T19:58:02Z"
    574       }
    575     ],
    576     "top_points": 110,
    577     "total_points": 168,
    578     "total_comments": 90
    579   }
    580 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs