scan-v5.json (28502B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Manipulating Multimodal Agents via Cross-Modal Prompt Injection", 6 "authors": [ 7 "Le Wang", 8 "Zonghao Ying", 9 "Tianyuan Zhang", 10 "Siyuan Liang", 11 "Shengshan Hu", 12 "Mingchuan Zhang", 13 "Aishan Liu", 14 "Xianglong Liu" 15 ], 16 "year": 2025, 17 "venue": "ACM Multimedia", 18 "arxiv_id": "2504.14348", 19 "doi": "10.1145/3746027.3755211" 20 }, 21 "checklist": { 22 "claims_and_evidence": { 23 "abstract_claims_supported": { 24 "applies": true, 25 "answer": false, 26 "justification": "The abstract claims 'at least a +30.1% increase in attack success rates' but the conclusion of the same paper states 'at least +26.4% increase in attack success rates' — a direct numerical inconsistency for the same core claim within the same paper.", 27 "source": "haiku" 28 }, 29 "causal_claims_justified": { 30 "applies": true, 31 "answer": true, 32 "justification": "Ablation studies in Tables 2–4 systematically remove each component (visual alignment, textual enhancement, surrogate LLM choice) to isolate causal contributions, providing adequate support for component-level causal claims.", 33 "source": "haiku" 34 }, 35 "generalization_bounded": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper tests only 2 custom-built chatbots (RecipeMaster, PoetryGenius) with 2 VLMs and 1 physical vehicle over 10 trials, yet makes broad claims about 'multimodal agents' and 'safety-critical applications' without bounding these to the tested narrow scope.", 39 "source": "haiku" 40 }, 41 "alternative_explanations_discussed": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper does not consider alternative explanations for CrossInject's effectiveness, such as whether the combined attack surface area (rather than the specific optimization) is the primary driver, or whether the custom agents are unusually vulnerable.", 45 "source": "haiku" 46 }, 47 "proxy_outcome_distinction": { 48 "applies": true, 49 "answer": false, 50 "justification": "Attack Success Rate is measured by Qwen-Max as LLM-as-a-Judge, which is a proxy for actual malicious execution; the paper does not discuss how well this proxy correlates with real-world attack outcomes or validate the judge's reliability.", 51 "source": "haiku" 52 } 53 }, 54 "limitations_and_scope": { 55 "limitations_section_present": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper contains only a 2-item bullet list in the conclusion labeled 'Limitations' that frames both points as future work opportunities ('we would like to explore'), not as honest admissions of current methodological weaknesses.", 59 "source": "haiku" 60 }, 61 "threats_to_validity_specific": { 62 "applies": true, 63 "answer": false, 64 "justification": "No threats to validity are discussed; the paper does not address concerns such as LLM judge reliability, the use of custom-built rather than production agents, or the very small physical-world sample size (n=10).", 65 "source": "haiku" 66 }, 67 "scope_boundaries_stated": { 68 "applies": true, 69 "answer": false, 70 "justification": "No explicit scope boundaries are stated; the paper makes no statement about what settings the results do NOT apply to, despite testing in a narrow and controlled environment.", 71 "source": "haiku" 72 } 73 }, 74 "conflicts_of_interest": { 75 "funding_disclosed": { 76 "applies": true, 77 "answer": false, 78 "justification": "No funding acknowledgment or grant information appears anywhere in the paper text provided.", 79 "source": "haiku" 80 }, 81 "affiliations_disclosed": { 82 "applies": true, 83 "answer": true, 84 "justification": "Author affiliations are clearly listed on the first page: Beihang University, National University of Singapore, Huazhong University of Science and Technology, and Henan University of Science and Technology.", 85 "source": "haiku" 86 }, 87 "funder_independent_of_outcome": { 88 "applies": false, 89 "answer": false, 90 "justification": "No funding disclosed, so independence cannot be assessed.", 91 "source": "haiku" 92 }, 93 "financial_interests_declared": { 94 "applies": true, 95 "answer": false, 96 "justification": "No competing interests, patents, or financial interests statement appears anywhere in the paper.", 97 "source": "haiku" 98 } 99 }, 100 "scope_and_framing": { 101 "key_terms_defined": { 102 "applies": true, 103 "answer": true, 104 "justification": "Key terms are defined: 'cross-modal prompt injection' is formally defined in Section 3 with equations; 'multimodal agents' are defined in Section 2.1; 'ASR' and 'PNA' are defined in the evaluation metrics subsection.", 105 "source": "haiku" 106 }, 107 "intended_contribution_clear": { 108 "applies": true, 109 "answer": true, 110 "justification": "The intended contribution is explicitly listed as three bullet points: identifying the cross-modal injection vulnerability, proposing the CrossInject framework, and providing empirical evaluations across digital and physical agents.", 111 "source": "haiku" 112 }, 113 "engagement_with_prior_work": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 2 provides substantive engagement with related work on multimodal agents and prompt injection, clearly positioning CrossInject against JIP (visual unimodal) and FB (textual unimodal) and explaining why prior approaches fail in multimodal settings.", 117 "source": "haiku" 118 } 119 } 120 }, 121 "type_checklist": { 122 "empirical": { 123 "artifacts": { 124 "code_released": { 125 "applies": true, 126 "answer": false, 127 "justification": "The paper states 'All code is implemented in PyTorch' but provides no repository URL, code release, or promise of release anywhere in the paper.", 128 "source": "haiku" 129 }, 130 "data_released": { 131 "applies": true, 132 "answer": true, 133 "justification": "The evaluation uses standard public benchmarks: CoEDIT (arXiv:2305.09857) and SST2 (Socher et al., 2013), both publicly available.", 134 "source": "haiku" 135 }, 136 "environment_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "Only 'NVIDIA A800-SXM4-80GB GPU cluster' and 'PyTorch' are mentioned; no requirements.txt, Dockerfile, CUDA version, PyTorch version, or dependency list is provided.", 140 "source": "haiku" 141 }, 142 "reproduction_instructions": { 143 "applies": true, 144 "answer": false, 145 "justification": "No step-by-step reproduction instructions are provided; hyperparameters are scattered across the setup section but there is no reproducible workflow.", 146 "source": "haiku" 147 } 148 }, 149 "statistical_methodology": { 150 "confidence_intervals_or_error_bars": { 151 "applies": true, 152 "answer": false, 153 "justification": "Results are averaged across 3 runs but no confidence intervals, error bars, or standard deviations are reported in any table or figure.", 154 "source": "haiku" 155 }, 156 "significance_tests": { 157 "applies": true, 158 "answer": false, 159 "justification": "No statistical significance tests are used for any comparative claims, including the core claim that CrossInject outperforms baselines.", 160 "source": "haiku" 161 }, 162 "effect_sizes_reported": { 163 "applies": true, 164 "answer": true, 165 "justification": "Percentage-point improvements over baselines are reported throughout (e.g., '+32.7% average performance gain' in local document scenarios, '+18.7% average drop' when removing visual alignment) with baseline context.", 166 "source": "haiku" 167 }, 168 "sample_size_justified": { 169 "applies": true, 170 "answer": false, 171 "justification": "100 samples per task dataset is not justified; the physical world evaluation uses only 10 trials with no power analysis or justification for either sample size.", 172 "source": "haiku" 173 }, 174 "variance_reported": { 175 "applies": true, 176 "answer": false, 177 "justification": "The paper states 'each experiment was repeated three times, and the average results are reported' but no standard deviation, variance, or spread is provided for any result.", 178 "source": "haiku" 179 } 180 }, 181 "evaluation_design": { 182 "baselines_included": { 183 "applies": true, 184 "answer": true, 185 "justification": "Three baselines are included: Naive (direct instruction), JIP (visual modality injection, ICLR 2024), and FB (textual delimiter injection, USENIX 2024).", 186 "source": "haiku" 187 }, 188 "baselines_contemporary": { 189 "applies": true, 190 "answer": true, 191 "justification": "JIP (Shayegani et al., ICLR 2024) and FB (Liu et al., USENIX Security 2024) are contemporary and represent state-of-the-art unimodal approaches at the time of writing.", 192 "source": "haiku" 193 }, 194 "ablation_study": { 195 "applies": true, 196 "answer": true, 197 "justification": "Extensive ablations are conducted: visual alignment variants (4 configurations), perturbation budget sweep (ε=2 to 32), textual enhancement variants (4 configurations), optimization iterations (50–150), and surrogate LLM comparison (4 models).", 198 "source": "haiku" 199 }, 200 "multiple_metrics": { 201 "applies": true, 202 "answer": true, 203 "justification": "Two metrics are used: Attack Success Rate (ASR) measuring attack effectiveness and Performance under No Attack (PNA) measuring baseline agent capability.", 204 "source": "haiku" 205 }, 206 "human_evaluation": { 207 "applies": true, 208 "answer": false, 209 "justification": "No human evaluation is conducted; ASR is determined entirely by Qwen-Max as an LLM-as-a-Judge, which is an automated proxy for human judgment of attack success.", 210 "source": "haiku" 211 }, 212 "held_out_test_set": { 213 "applies": true, 214 "answer": true, 215 "justification": "100 randomly sampled entries from public test splits (CoEDIT, SST2) are used as the injected instruction evaluation set; these are not used during the GCG or visual optimization process.", 216 "source": "haiku" 217 }, 218 "per_category_breakdown": { 219 "applies": true, 220 "answer": true, 221 "justification": "Results are broken down across all combinations of agent role (RM/PG), VLM backbone (Qwen2-VL/Phi-3.5-vision), task (Text Editing/Sentiment Analysis), and attack surface (local document/online webpage).", 222 "source": "haiku" 223 }, 224 "failure_cases_discussed": { 225 "applies": true, 226 "answer": false, 227 "justification": "The paper discusses failure of baseline methods (JIP at 0% ASR) but does not discuss cases where CrossInject itself fails or underperforms, nor analyze the conditions under which the attack succeeds vs. fails.", 228 "source": "haiku" 229 }, 230 "negative_results_reported": { 231 "applies": true, 232 "answer": true, 233 "justification": "The paper reports that JIP achieves 0% ASR in all scenarios (a strong negative result for visual-only attacks) and that online webpages produce ~10.8% lower ASR than local documents.", 234 "source": "haiku" 235 } 236 }, 237 "setup_transparency": { 238 "model_versions_specified": { 239 "applies": true, 240 "answer": false, 241 "justification": "Models are named (Qwen2-VL 2B, Phi-3.5-vision, Llama-3.1-8B-Instruct, Stable-Diffusion-3.5-Large, Qwen-Max) but no snapshot dates, checkpoint hashes, or specific versioned identifiers are provided.", 242 "source": "haiku" 243 }, 244 "prompts_provided": { 245 "applies": true, 246 "answer": false, 247 "justification": "The paper describes adversarial meta prompting and GCG optimization but does not provide the actual system prompts used for RecipeMaster or PoetryGenius, nor the LLM judge evaluation prompt.", 248 "source": "haiku" 249 }, 250 "hyperparameters_reported": { 251 "applies": true, 252 "answer": true, 253 "justification": "Hyperparameters are reported: visual perturbation ε=16, 200 iterations; GCG top-k=256, batch size=512, 100 iterations; Gaussian blur 9×9 kernel; max new tokens=1024.", 254 "source": "haiku" 255 }, 256 "scaffolding_described": { 257 "applies": true, 258 "answer": false, 259 "justification": "The agent architectures (RecipeMaster, PoetryGenius) are described only at a high level ('multimodal data processing, API calls, logical reasoning'); no implementation code, tool definitions, or workflow diagrams sufficient for replication are provided.", 260 "source": "haiku" 261 }, 262 "data_preprocessing_documented": { 263 "applies": true, 264 "answer": true, 265 "justification": "The paper documents preprocessing for both attack surfaces: malicious instructions embedded in HTML5 tags with whitespace characters for webpage attacks, and direct text embedding for local document attacks.", 266 "source": "haiku" 267 } 268 }, 269 "data_integrity": { 270 "raw_data_available": { 271 "applies": true, 272 "answer": false, 273 "justification": "Raw experimental outputs (attack logs, LLM judge responses, per-sample results) are not released or made available.", 274 "source": "haiku" 275 }, 276 "data_collection_described": { 277 "applies": true, 278 "answer": true, 279 "justification": "The evaluation datasets are standard public benchmarks; 100 entries are randomly sampled from each, which is described; the physical world setup is photographically documented in Figure 5.", 280 "source": "haiku" 281 }, 282 "recruitment_methods_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants are involved in this study.", 286 "source": "haiku" 287 }, 288 "data_pipeline_documented": { 289 "applies": true, 290 "answer": false, 291 "justification": "The attack pipeline is described at a methodological level but the full data pipeline from dataset sampling through LLM judge scoring to aggregated ASR computation is not documented.", 292 "source": "haiku" 293 } 294 }, 295 "contamination": { 296 "training_cutoff_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "Neither Qwen2-VL nor Phi-3.5-vision training data cutoffs are stated, despite SST2 (2013) being highly likely to appear in their training data.", 300 "source": "haiku" 301 }, 302 "train_test_overlap_discussed": { 303 "applies": true, 304 "answer": false, 305 "justification": "No discussion of whether VLM training data included SST2 (a 2013 benchmark) or CoEDIT (2023), which could affect agent behavior on the injected tasks.", 306 "source": "haiku" 307 }, 308 "benchmark_contamination_addressed": { 309 "applies": true, 310 "answer": false, 311 "justification": "SST2 is a widely-used 2013 benchmark almost certainly in all modern VLM training corpora; the paper does not address whether this affects the validity of using it as the injected task target.", 312 "source": "haiku" 313 } 314 }, 315 "human_studies": { 316 "pre_registered": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants.", 320 "source": "haiku" 321 }, 322 "irb_or_ethics_approval": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants.", 326 "source": "haiku" 327 }, 328 "demographics_reported": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants.", 332 "source": "haiku" 333 }, 334 "inclusion_exclusion_criteria": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants.", 338 "source": "haiku" 339 }, 340 "randomization_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants.", 344 "source": "haiku" 345 }, 346 "blinding_described": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants.", 350 "source": "haiku" 351 }, 352 "attrition_reported": { 353 "applies": false, 354 "answer": false, 355 "justification": "No human participants.", 356 "source": "haiku" 357 } 358 }, 359 "cost_and_practicality": { 360 "inference_cost_reported": { 361 "applies": true, 362 "answer": false, 363 "justification": "No inference cost, latency, or runtime estimates are reported for any component of the attack pipeline.", 364 "source": "haiku" 365 }, 366 "compute_budget_stated": { 367 "applies": true, 368 "answer": false, 369 "justification": "The paper mentions an 'NVIDIA A800-SXM4-80GB GPU cluster' but does not state total GPU-hours, number of GPUs, or overall compute budget for the experiments.", 370 "source": "haiku" 371 } 372 } 373 } 374 }, 375 "claims": [ 376 { 377 "claim": "CrossInject achieves at least +30.1% higher ASR than existing prompt injection methods across diverse tasks", 378 "evidence": "Table 1 shows improvements over baselines; however, the conclusion states '+26.4%' for the same claim, creating an internal inconsistency. Local document gains average 32.7%, online webpage gains average 27.5%.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "Visual-only prompt injection (JIP) achieves 0% ASR against state-of-the-art multimodal agents", 383 "evidence": "Table 1 consistently shows JIP at 0% ASR across all 16 experimental conditions (2 agents × 2 VLMs × 2 tasks × 2 attack surfaces).", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Both visual and textual components are individually necessary for effective cross-modal injection", 388 "evidence": "Tables 2 and 3 show ablations: removing visual alignment causes average 18.7% ASR drop; removing textual enhancement causes average 24.8% ASR drop.", 389 "supported": "strong" 390 }, 391 { 392 "claim": "CrossInject is effective against real-world autonomous driving agents, achieving 9/10 attack success", 393 "evidence": "Physical world experiment on LIMO vehicle: 9/10 ASR with CrossInject vs. 4/10 with naive textual attack. Only 10 trials with no statistical analysis.", 394 "supported": "weak" 395 }, 396 { 397 "claim": "Existing defenses (sandwich prompting, Gaussian blur) are largely ineffective against CrossInject", 398 "evidence": "Table 5 shows textual defense reduces ASR by average 6.7%, visual defense by 2.8%, combined by slightly more; CrossInject maintains >70% ASR in all conditions even with combined defense.", 399 "supported": "moderate" 400 }, 401 { 402 "claim": "Aligning visual input with a generated target image outperforms direct text-image feature alignment", 403 "evidence": "Table 2 ablation shows 'Align with image (Ours)' outperforms 'Align with text' by 11.1% on average, validating the text-to-image intermediate step.", 404 "supported": "moderate" 405 } 406 ], 407 "methodology_tags": [ 408 "benchmark-eval", 409 "case-study" 410 ], 411 "key_findings": "CrossInject demonstrates that coordinated cross-modal injection targeting both visual and textual inputs is substantially more effective than unimodal attacks against multimodal agents, with visual-only attacks (JIP) achieving exactly 0% ASR across all tested conditions while CrossInject achieves 25–97% ASR depending on model and task. Ablation studies establish that both visual latent alignment (via text-to-image generation) and textual guidance enhancement (via GCG optimization) are individually necessary, with their removal causing 19% and 25% average ASR drops respectively. Existing defenses reduce ASR by only 3–7% on average, suggesting the attack surfaces a fundamental vulnerability in current multimodal agent architectures. A physical-world demonstration on an autonomous driving vehicle shows 9/10 attack success inducing near-collision behavior.", 412 "red_flags": [ 413 { 414 "flag": "Abstract/conclusion number inconsistency", 415 "detail": "The abstract claims '+30.1% increase in ASR' while the conclusion states '+26.4% increase in ASR' for the same core performance claim within the same paper version." 416 }, 417 { 418 "flag": "Narrow evaluation scope with broad claims", 419 "detail": "Claims about 'multimodal agents' broadly are supported by only 2 custom-built research chatbots (RecipeMaster, PoetryGenius) that are not production systems, plus 10 trials on one physical robot." 420 }, 421 { 422 "flag": "LLM judge reliability unvalidated", 423 "detail": "ASR is entirely determined by Qwen-Max as LLM-as-a-Judge; no human verification or judge reliability analysis is provided, making the primary metric's validity uncertain." 424 }, 425 { 426 "flag": "No variance despite 3 repetitions", 427 "detail": "The paper reports averaging over 3 runs but provides no standard deviations or confidence intervals anywhere, masking potential instability in the attack." 428 }, 429 { 430 "flag": "Physical world experiment underpowered", 431 "detail": "The physical autonomous driving demonstration uses only 10 trials with no statistical analysis, power justification, or controlled conditions beyond a single manually constructed track." 432 }, 433 { 434 "flag": "Benchmark contamination ignored", 435 "detail": "SST2 (2013) and CoEDIT (2023) used as injected task targets are almost certainly in the training data of both Qwen2-VL and Phi-3.5-vision; the paper does not address how this affects ASR validity." 436 } 437 ], 438 "cited_papers": [ 439 { 440 "title": "Prompt Injection attack against LLM-integrated Applications", 441 "relevance": "Foundational prompt injection taxonomy and attack framework for LLM-integrated applications; CrossInject builds directly on this work." 442 }, 443 { 444 "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses", 445 "relevance": "Primary textual baseline (FB method) compared against in all experiments; provides formal benchmark for prompt injection evaluation." 446 }, 447 { 448 "title": "Jailbreak in pieces: Compositional adversarial attacks on multi-modal language models", 449 "relevance": "Primary visual baseline (JIP method) compared against; shows visual-only injection fails against modern multimodal agents." 450 }, 451 { 452 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 453 "relevance": "GCG algorithm used directly for textual injection optimization in CrossInject's Textual Guidance Enhancement component." 454 }, 455 { 456 "title": "Abusing Images and Sounds for Indirect Instruction Injection in Multi-Modal LLMs", 457 "relevance": "Prior work on instruction injection via visual/audio modalities under white-box settings; CrossInject extends this to black-box multimodal agents." 458 }, 459 { 460 "title": "Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models", 461 "relevance": "Key related work on indirect prompt injection in agent external data sources; informs CrossInject's external data implantation threat model." 462 }, 463 { 464 "title": "Dissecting Adversarial Robustness of Multimodal LM Agents", 465 "relevance": "Related analysis of adversarial robustness specifically in multimodal LM agent contexts; directly relevant survey paper." 466 }, 467 { 468 "title": "Large Multimodal Agents: A Survey", 469 "relevance": "Survey of multimodal agent architectures (perception-planning-action) that CrossInject targets; defines the agent paradigm attacked." 470 }, 471 { 472 "title": "InjectAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 473 "relevance": "Benchmark for indirect prompt injection in tool-integrated agents; contextualizes CrossInject within the broader injection attack landscape." 474 } 475 ], 476 "engagement_factors": { 477 "practical_relevance": { 478 "score": 2, 479 "justification": "Demonstrates a realistic black-box attack against deployed multimodal agent services (cloud APIs, chatbots), directly actionable for security practitioners assessing agent deployments." 480 }, 481 "surprise_contrarian": { 482 "score": 2, 483 "justification": "The finding that visual-only adversarial attacks (JIP) achieve exactly 0% ASR against modern multimodal agents challenges the assumption that visual adversarial perturbations transfer to agent manipulation." 484 }, 485 "fear_safety": { 486 "score": 3, 487 "justification": "Physically demonstrates that an autonomous driving agent can be induced to ignore stop signs and cause collisions via cross-modal injection, directly raising safety-critical AI risk concerns." 488 }, 489 "drama_conflict": { 490 "score": 2, 491 "justification": "Shows that existing defenses (sandwich prompting, Gaussian blur) provide only 3–7% protection against the attack, creating a strong conflict between the deployed defense landscape and actual security." 492 }, 493 "demo_ability": { 494 "score": 1, 495 "justification": "The attack requires significant ML infrastructure (A800 GPU cluster, multiple large models, GCG optimization), no code is released, and the custom agent setup is not publicly available." 496 }, 497 "brand_recognition": { 498 "score": 1, 499 "justification": "Beihang University is a recognized Chinese research institution; the paper tests against recognizable models (Qwen2-VL, Phi-3.5-vision) but has no famous Western lab affiliation." 500 } 501 }, 502 "hn_data": { 503 "threads": [ 504 { 505 "hn_id": "45341511", 506 "title": "Learn Your Way: Towards an AI-Augmented Textbook, Google Research", 507 "points": 3, 508 "comments": 0, 509 "url": "https://news.ycombinator.com/item?id=45341511" 510 }, 511 { 512 "hn_id": "44619169", 513 "title": "Palatable Conceptions of Disembodied Being", 514 "points": 3, 515 "comments": 0, 516 "url": "https://news.ycombinator.com/item?id=44619169" 517 }, 518 { 519 "hn_id": "43411379", 520 "title": "New Computer with intergrated Brain Computer interface", 521 "points": 3, 522 "comments": 0, 523 "url": "https://news.ycombinator.com/item?id=43411379" 524 }, 525 { 526 "hn_id": "45461534", 527 "title": "Comparing Quantum Annealing and BF-DCQO", 528 "points": 2, 529 "comments": 0, 530 "url": "https://news.ycombinator.com/item?id=45461534" 531 }, 532 { 533 "hn_id": "27038055", 534 "title": "The Breakthrough Listen Search for Intelligent Life Near the Galactic Center I", 535 "points": 2, 536 "comments": 0, 537 "url": "https://news.ycombinator.com/item?id=27038055" 538 }, 539 { 540 "hn_id": "45302119", 541 "title": "VCBench: Benchmarking LLMs in Venture Capital", 542 "points": 1, 543 "comments": 0, 544 "url": "https://news.ycombinator.com/item?id=45302119" 545 }, 546 { 547 "hn_id": "43417925", 548 "title": "Bioscience Lab in home for your Brain and Body, control laptop via mind", 549 "points": 1, 550 "comments": 0, 551 "url": "https://news.ycombinator.com/item?id=43417925" 552 }, 553 { 554 "hn_id": "40129452", 555 "title": "A Survey on Self-Evolution of Large Language Models", 556 "points": 1, 557 "comments": 0, 558 "url": "https://news.ycombinator.com/item?id=40129452" 559 } 560 ], 561 "top_points": 3, 562 "total_points": 16, 563 "total_comments": 0 564 } 565 }