scan-v4.json (29626B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Detecting and Correcting Hallucinations in LLM-Generated Code via Deterministic AST Analysis", 6 "authors": [ 7 "Dipin Khati", 8 "Daniel Rodriguez-Cardenas", 9 "Paul Pantzer", 10 "Denys Poshyvanyk" 11 ], 12 "year": 2026, 13 "venue": "FORGE '26", 14 "arxiv_id": "2601.19106", 15 "doi": "10.1145/3793655.3793725" 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "Abstract claims of 100% precision, 87.6% recall, 0.934 F1, and 77.0% fix accuracy are all directly supported by results in §3 and Tables 1-4.", 23 "source": "opus" 24 }, 25 "causal_claims_justified": { 26 "applies": false, 27 "answer": false, 28 "justification": "The paper does not make causal claims. It reports detection/correction performance of a deterministic tool without claiming causal relationships.", 29 "source": "opus" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": false, 34 "justification": "The abstract claims the framework offers 'a clear path toward trustworthy code generation' broadly, but it was tested only on 200 Python snippets across 5 libraries. The title says 'LLM-Generated Code' without bounding to Python. §4 acknowledges some limitations but the framing in abstract/title/conclusion overgeneralizes.", 35 "source": "opus" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": false, 40 "justification": "No alternative explanations for the results are discussed. For instance, the high precision could partly reflect the simplicity of the curated dataset rather than inherent framework reliability.", 41 "source": "opus" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper measures detection/correction on a curated dataset but frames results as 'trustworthy code generation' and 'reliable alternative to probabilistic repair' without discussing the gap between curated-dataset performance and real-world code generation trustworthiness.", 47 "source": "opus" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": true, 54 "justification": "§4 (Discussion and Future Work) contains a substantial paragraph beginning 'We must acknowledge the limitations of this study' discussing dataset size, library coverage, single-file analysis, and scope of targeted errors.", 55 "source": "opus" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": true, 60 "justification": "§4 discusses specific threats: '200-sample dataset...is not exhaustive', 'error distribution may not reflect real-world prevalence', 'Knowledge Base was limited to five Python libraries', 'does not yet handle multi-module dataflows'.", 61 "source": "opus" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": true, 66 "justification": "§4 explicitly states: 'our approach deliberately targets KCHs and does not attempt to solve more complex, multi-line logical errors' and 'currently focuses on single-file, function-level analysis.'", 67 "source": "opus" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": false, 74 "justification": "No funding source is mentioned anywhere in the paper.", 75 "source": "opus" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "All four authors are listed with William & Mary affiliation. No product being evaluated is affiliated with the authors.", 81 "source": "opus" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": false, 86 "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not evidence of absence of conflict.", 87 "source": "opus" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No competing interests or financial interests statement is present in the paper.", 93 "source": "opus" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "KCH defined as 'subtle, semantic errors' with two types: 'API Knowledge Conflicts' (non-existent functions/parameters) and 'Identifier Knowledge Conflicts' (variable misuse). Examples (pd.read_exel, max_len_str) provided.", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "Clear RQ1/RQ2 framework: deterministic static-analysis for detecting KCHs via AST+KB introspection. Novel: post-processing correction (vs. constrained decoding or LLM-in-the-loop approaches). Contribution explicitly positioned relative to prior work.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "Engages with PICARD/Synchromesh (prevention-based), LLM-in-the-loop repair, Structural Trimming (deletion-based). Shows how this work differs (deterministic, correction-focused, resolution vs. deletion). Related work section (§5) references hallucination taxonomy work.", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "empirical": { 119 "artifacts": { 120 "code_released": { 121 "applies": true, 122 "answer": true, 123 "justification": "Replication package provided at GitHub (ref [3]: https://github.com/WM-SEMERU/Hallucinations-in-Code), explicitly stated as publicly available in §1.", 124 "source": "opus" 125 }, 126 "data_released": { 127 "applies": true, 128 "answer": true, 129 "justification": "The 200-sample dataset is stated to be part of the replication package: 'All data, code, and experimental configurations are publicly available in our replication package [3]' (§1).", 130 "source": "opus" 131 }, 132 "environment_specified": { 133 "applies": true, 134 "answer": false, 135 "justification": "No environment specifications, requirements.txt, or dependency versions are mentioned in the paper. Only a GitHub link is provided.", 136 "source": "opus" 137 }, 138 "reproduction_instructions": { 139 "applies": true, 140 "answer": false, 141 "justification": "No step-by-step reproduction instructions are provided in the paper. The replication package is mentioned but no instructions for running it are given.", 142 "source": "opus" 143 } 144 }, 145 "statistical_methodology": { 146 "confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": false, 149 "justification": "Only point estimates are reported (100% precision, 87.6% recall, 77.0% fix accuracy). No confidence intervals or error bars.", 150 "source": "opus" 151 }, 152 "significance_tests": { 153 "applies": false, 154 "answer": false, 155 "justification": "The paper does not make comparative claims between systems. It evaluates a single deterministic system with no stochastic comparisons requiring significance tests.", 156 "source": "opus" 157 }, 158 "effect_sizes_reported": { 159 "applies": false, 160 "answer": false, 161 "justification": "No comparative claims are made between systems, so effect sizes are not applicable. The paper reports absolute performance of a single system.", 162 "source": "opus" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "The dataset has 200 samples (161 hallucinated, 39 clean) but no justification for why this size was chosen or whether it is adequate for the claims made.", 168 "source": "opus" 169 }, 170 "variance_reported": { 171 "applies": false, 172 "answer": false, 173 "justification": "The system is fully deterministic ('completed in under 0.2 seconds', §2.5). There are no stochastic runs to report variance across.", 174 "source": "opus" 175 } 176 }, 177 "evaluation_design": { 178 "baselines_included": { 179 "applies": true, 180 "answer": false, 181 "justification": "No baseline systems are compared against. The paper only reports its own framework's performance. Related work discusses PICARD, Synchromesh, LLM-in-the-loop repair, and Structural Trimming but does not compare against them experimentally.", 182 "source": "opus" 183 }, 184 "baselines_contemporary": { 185 "applies": true, 186 "answer": false, 187 "justification": "No baselines are included, so contemporaneity cannot be assessed.", 188 "source": "opus" 189 }, 190 "ablation_study": { 191 "applies": true, 192 "answer": false, 193 "justification": "The framework has multiple components (AST parser, KB, validation rules for unknown API/bare calls/semantic inconsistency, correction module) but no ablation study isolating their contributions.", 194 "source": "opus" 195 }, 196 "multiple_metrics": { 197 "applies": true, 198 "answer": true, 199 "justification": "Reports precision, recall, F1-score for detection, and fix accuracy for correction (Tables 1-4, §3).", 200 "source": "opus" 201 }, 202 "human_evaluation": { 203 "applies": true, 204 "answer": false, 205 "justification": "No human evaluation of the corrections is mentioned. Evaluation is entirely automated. Human evaluation would be relevant to assess whether corrections are semantically appropriate.", 206 "source": "opus" 207 }, 208 "held_out_test_set": { 209 "applies": false, 210 "answer": false, 211 "justification": "The system is not trained/tuned — it is a deterministic rule-based framework. There is no training/validation/test split concern.", 212 "source": "opus" 213 }, 214 "per_category_breakdown": { 215 "applies": true, 216 "answer": true, 217 "justification": "Tables 3 and 4 provide breakdowns by KCH type (Missing Imports, Mis-typed API Calls, Contextual Mismatches) and by library (numpy, pandas, matplotlib, json, requests).", 218 "source": "opus" 219 }, 220 "failure_cases_discussed": { 221 "applies": true, 222 "answer": true, 223 "justification": "§4 provides manual analysis of 37 failed cases (20 false negatives, 17 failed corrections) with specific examples like 'plt.plotx instead of plt.plot' and the surface-typo-vs-semantic-error problem.", 224 "source": "opus" 225 }, 226 "negative_results_reported": { 227 "applies": true, 228 "answer": true, 229 "justification": "Contextual Mismatches had only 33.3% detection rate and 0.0% correction accuracy (Table 3). Pandas had only 56.2% correction accuracy (Table 4). These are clearly negative results.", 230 "source": "opus" 231 } 232 }, 233 "setup_transparency": { 234 "model_versions_specified": { 235 "applies": true, 236 "answer": false, 237 "justification": "The dataset was generated using 'GPT-5' (§2.6) but no version, snapshot date, or API version is specified.", 238 "source": "opus" 239 }, 240 "prompts_provided": { 241 "applies": true, 242 "answer": false, 243 "justification": "Dataset was generated by 'prompting GPT-5 with task-oriented instructions' (§2.6) but the actual prompts used are not provided.", 244 "source": "opus" 245 }, 246 "hyperparameters_reported": { 247 "applies": true, 248 "answer": false, 249 "justification": "No hyperparameters for GPT-5 generation (temperature, top-p, etc.) are reported. The edit-distance threshold for correction is also not specified.", 250 "source": "opus" 251 }, 252 "scaffolding_described": { 253 "applies": false, 254 "answer": false, 255 "justification": "No agentic scaffolding is used. The framework is a deterministic static analysis pipeline.", 256 "source": "opus" 257 }, 258 "data_preprocessing_documented": { 259 "applies": true, 260 "answer": false, 261 "justification": "The paper states 200 samples were 'curated' from GPT-5 output but does not describe how many were initially generated, what curation criteria were applied, or how the 161/39 hallucinated/clean split was determined.", 262 "source": "opus" 263 } 264 }, 265 "data_integrity": { 266 "raw_data_available": { 267 "applies": true, 268 "answer": true, 269 "justification": "Replication package [3] is stated to contain all data, code, and experimental configurations.", 270 "source": "opus" 271 }, 272 "data_collection_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "§2.6 describes dataset construction: 200 Python samples generated by prompting GPT-5 for 5 target libraries, composed of 161 hallucinated (3 categories) and 39 clean samples.", 276 "source": "opus" 277 }, 278 "recruitment_methods_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants. Data is LLM-generated code snippets from a standard model, not a benchmark requiring recruitment description.", 282 "source": "opus" 283 }, 284 "data_pipeline_documented": { 285 "applies": true, 286 "answer": false, 287 "justification": "The paper does not describe how GPT-5 outputs were selected, filtered, or curated into the final 200. How many were generated initially? What criteria determined inclusion? This is undocumented.", 288 "source": "opus" 289 } 290 }, 291 "contamination": { 292 "training_cutoff_stated": { 293 "applies": false, 294 "answer": false, 295 "justification": "The system being evaluated is a deterministic static analysis tool, not a pre-trained model. Contamination of model training data is not relevant to evaluating this framework.", 296 "source": "opus" 297 }, 298 "train_test_overlap_discussed": { 299 "applies": false, 300 "answer": false, 301 "justification": "The framework is rule-based, not trained. No train/test overlap concern exists.", 302 "source": "opus" 303 }, 304 "benchmark_contamination_addressed": { 305 "applies": false, 306 "answer": false, 307 "justification": "The framework is deterministic and not trained on any data. Benchmark contamination is not applicable.", 308 "source": "opus" 309 } 310 }, 311 "human_studies": { 312 "pre_registered": { 313 "applies": false, 314 "answer": false, 315 "justification": "No human participants in this study.", 316 "source": "opus" 317 }, 318 "irb_or_ethics_approval": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants in this study.", 322 "source": "opus" 323 }, 324 "demographics_reported": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human participants in this study.", 328 "source": "opus" 329 }, 330 "inclusion_exclusion_criteria": { 331 "applies": false, 332 "answer": false, 333 "justification": "No human participants in this study.", 334 "source": "opus" 335 }, 336 "randomization_described": { 337 "applies": false, 338 "answer": false, 339 "justification": "No human participants in this study.", 340 "source": "opus" 341 }, 342 "blinding_described": { 343 "applies": false, 344 "answer": false, 345 "justification": "No human participants in this study.", 346 "source": "opus" 347 }, 348 "attrition_reported": { 349 "applies": false, 350 "answer": false, 351 "justification": "No human participants in this study.", 352 "source": "opus" 353 } 354 }, 355 "cost_and_practicality": { 356 "inference_cost_reported": { 357 "applies": true, 358 "answer": true, 359 "justification": "§2.5 reports: 'the end-to-end analysis of all 200 samples completed in under 0.2 seconds on a single laptop CPU.'", 360 "source": "opus" 361 }, 362 "compute_budget_stated": { 363 "applies": true, 364 "answer": true, 365 "justification": "The compute is minimal and stated: under 0.2 seconds on a single laptop CPU for all 200 samples (§2.5).", 366 "source": "opus" 367 } 368 }, 369 "experimental_rigor": { 370 "seed_sensitivity_reported": { 371 "applies": false, 372 "answer": false, 373 "justification": "The framework is fully deterministic with no random seeds. No stochastic component exists.", 374 "source": "opus" 375 }, 376 "number_of_runs_stated": { 377 "applies": false, 378 "answer": false, 379 "justification": "Deterministic system — a single run always produces the same output. Multiple runs are unnecessary.", 380 "source": "opus" 381 }, 382 "hyperparameter_search_budget": { 383 "applies": true, 384 "answer": false, 385 "justification": "The framework likely has tunable parameters (e.g., edit distance threshold for fuzzy matching) but no hyperparameter search is described.", 386 "source": "opus" 387 }, 388 "best_config_selection_justified": { 389 "applies": true, 390 "answer": false, 391 "justification": "No discussion of how design choices (e.g., edit distance thresholds, semantic cue definitions) were selected or validated.", 392 "source": "opus" 393 }, 394 "multiple_comparison_correction": { 395 "applies": false, 396 "answer": false, 397 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable.", 398 "source": "opus" 399 }, 400 "self_comparison_bias_addressed": { 401 "applies": true, 402 "answer": false, 403 "justification": "The authors evaluate their own system on their own curated dataset without acknowledging the potential bias of designing both the tool and its evaluation data.", 404 "source": "opus" 405 }, 406 "compute_budget_vs_performance": { 407 "applies": false, 408 "answer": false, 409 "justification": "No comparison with other systems, so compute-matched comparison is not applicable.", 410 "source": "opus" 411 }, 412 "benchmark_construct_validity": { 413 "applies": true, 414 "answer": false, 415 "justification": "The 200-sample curated dataset's representativeness of real-world KCH distribution is not discussed. §4 briefly notes 'error distribution may not reflect real-world prevalence' but does not analyze construct validity.", 416 "source": "opus" 417 }, 418 "scaffold_confound_addressed": { 419 "applies": false, 420 "answer": false, 421 "justification": "No scaffolding is involved. The system is a standalone static analysis tool.", 422 "source": "opus" 423 } 424 }, 425 "data_leakage": { 426 "temporal_leakage_addressed": { 427 "applies": false, 428 "answer": false, 429 "justification": "The evaluated system is a deterministic rule-based tool, not a trained model. Temporal leakage of training data is not applicable.", 430 "source": "opus" 431 }, 432 "feature_leakage_addressed": { 433 "applies": false, 434 "answer": false, 435 "justification": "The evaluated system is rule-based, not a trained model. Feature leakage is not applicable.", 436 "source": "opus" 437 }, 438 "non_independence_addressed": { 439 "applies": false, 440 "answer": false, 441 "justification": "No trained model is evaluated. Non-independence of train/test data is not applicable.", 442 "source": "opus" 443 }, 444 "leakage_detection_method": { 445 "applies": false, 446 "answer": false, 447 "justification": "No trained model is evaluated. Leakage detection is not applicable.", 448 "source": "opus" 449 } 450 } 451 } 452 }, 453 "claims": [ 454 { 455 "claim": "Framework detects KCHs with 100% precision and 87.6% recall on the 200-sample dataset", 456 "evidence": "Table 1: 141 true positives, 0 false positives, 20 false negatives, 39 true negatives. Precision=141/141=100%, Recall=141/161=87.6%, F1=0.934.", 457 "supported": "strong" 458 }, 459 { 460 "claim": "Framework auto-corrects 77% of identified hallucinations (124/161)", 461 "evidence": "Table 2: Total hallucinated samples identified = 161, successfully corrected = 124, fix accuracy = 77.0%.", 462 "supported": "strong" 463 }, 464 { 465 "claim": "Missing imports have highest detection/correction rate (97.9%)", 466 "evidence": "Table 3: 48 samples, 97.9% detection rate, 97.9% correction rate.", 467 "supported": "strong" 468 }, 469 { 470 "claim": "Contextual mismatches are hardest type (33.3% detection, 0% correction)", 471 "evidence": "Table 3: Only 3 samples of contextual mismatches, 33.3% detection, 0/3 corrected (0% accuracy).", 472 "supported": "moderate" 473 }, 474 { 475 "claim": "Deterministic approach is reliable alternative to probabilistic repair methods", 476 "evidence": "Discussion contrasts with PICARD, Synchromesh, LLM-in-the-loop. No empirical comparison provided; claim is logical but not validated.", 477 "supported": "moderate" 478 }, 479 { 480 "claim": "Pandas library shows lower performance (56.2% correction) due to semantic complexity", 481 "evidence": "Table 4: pandas 85.1% detection but only 56.2% correction. Discussion notes pandas failures involve 'surface-level typo vs. semantic error' distinction.", 482 "supported": "moderate" 483 } 484 ], 485 "methodology_tags": [ 486 "benchmark-eval", 487 "case-study" 488 ], 489 "key_findings": "The paper presents a static-analysis framework that detects knowledge-conflicting hallucinations (KCHs) in LLM-generated code with 100% precision and 87.6% recall, and auto-corrects 77% of detected errors using AST parsing and library introspection. The framework excels on missing-import errors (97.9%) but struggles with contextual mismatches (0% correction) and library-dependent tasks like pandas (56.2% correction). A critical limitation is that 'functionally correct' code is claimed but only AST structure—not actual execution—is verified.", 490 "red_flags": [ 491 { 492 "flag": "No baseline comparisons", 493 "detail": "Paper claims superiority over PICARD, Synchromesh, LLM-in-the-loop, and mypy but provides zero empirical comparison. These baselines are discussed only conceptually, not tested on the same dataset." 494 }, 495 { 496 "flag": "Proxy outcome mismatch", 497 "detail": "Paper claims 77% fix accuracy for 'functionally correct, runnable code' but explicitly states code is analyzed 'without running the code.' Only AST structure correctness is verified, not actual execution." 498 }, 499 { 500 "flag": "Suspiciously high precision (100%)", 501 "detail": "Perfect precision on 200 manually-curated samples raises overfitting concerns. No confidence intervals or cross-validation. Could indicate dataset is too simple or heavily biased toward easy cases." 502 }, 503 { 504 "flag": "Severe variance by category", 505 "detail": "Contextual mismatches: 0/3 corrected (0%). Pandas: 56.2% correction. This extreme variance (0%–97.9%) suggests poor generalization and high library-dependence." 506 }, 507 { 508 "flag": "Missing generation prompts", 509 "detail": "Dataset created by prompting GPT-5 but actual prompts not provided. Cannot reproduce dataset generation or assess prompt bias." 510 }, 511 { 512 "flag": "No inter-rater agreement", 513 "detail": "200-sample dataset appears single-annotated by authors. No inter-rater reliability study to validate what constitutes a KCH." 514 }, 515 { 516 "flag": "Small sample with extreme confidence", 517 "detail": "200 samples is small. Only 3 contextual-mismatch samples. 100% precision and 77% correction claims on this scale lack robustness." 518 }, 519 { 520 "flag": "Generalization not tested", 521 "detail": "Results limited to 5 libraries (numpy, pandas, requests, matplotlib, json). Unclear if framework works on scipy, tensorflow, torch, or industry libraries." 522 }, 523 { 524 "flag": "No cost comparison to baselines", 525 "detail": "Paper reports 0.2s for framework but does not compare cost/quality tradeoff with LLM-in-the-loop or constrained decoding approaches." 526 }, 527 { 528 "flag": "Missing statistical rigor", 529 "detail": "No confidence intervals, p-values, or significance tests. Single evaluation without bootstrap, cross-validation, or hold-out validation." 530 } 531 ], 532 "cited_papers": [ 533 { 534 "title": "Bugs in Large Language Models Generated Code: An Empirical Study", 535 "relevance": "Foundational taxonomy of LLM code bugs; establishes hallucination as a known failure mode." 536 }, 537 { 538 "title": "Exploring and Evaluating Hallucinations in LLM-Powered Code Generation", 539 "relevance": "Defines Knowledge Conflicting Hallucinations (KCH) category; prior work this paper builds on." 540 }, 541 { 542 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 543 "relevance": "Motivates productivity gains and risks from LLM code tools; field evidence for the problem space." 544 }, 545 { 546 "title": "PICARD: Parsing Incrementally for Constrained Auto-Regressive Decoding", 547 "relevance": "Constrained decoding baseline; prevents syntax errors but misses semantic KCHs." 548 }, 549 { 550 "title": "Synchromesh: Reliable code generation from pre-trained language models", 551 "relevance": "Prevention-based approach; paper contrasts deterministic repair vs. prevention strategies." 552 }, 553 { 554 "title": "Towards Understanding the Characteristics of Code Generation Errors Made by Large Language Models", 555 "relevance": "Error taxonomy; provides context for error classification and mitigation strategies." 556 }, 557 { 558 "title": "Mapping the Trust Terrain: LLMs in Software Engineering - Insights and Perspectives", 559 "relevance": "Trust and developer experience with LLM code generation; motivates reliability of fixes." 560 }, 561 { 562 "title": "Cutting the Root of Hallucination: Structural Trimming for Vulnerability Mitigation in Code LLMs", 563 "relevance": "Deletion-based repair approach; paper contrasts trimming (safety) vs. correction (functionality)." 564 } 565 ], 566 "engagement_factors": { 567 "practical_relevance": { 568 "score": 2, 569 "justification": "AST-based hallucination detection for LLM code is directly applicable to developer workflows, though the tool only covers 5 Python libraries currently." 570 }, 571 "surprise_contrarian": { 572 "score": 0, 573 "justification": "The idea that static analysis can catch API misuse is well-understood; the results confirm expectations rather than challenging them." 574 }, 575 "fear_safety": { 576 "score": 0, 577 "justification": "Addresses code correctness rather than safety, security, or misuse concerns." 578 }, 579 "drama_conflict": { 580 "score": 0, 581 "justification": "No controversy or conflict; positions itself as complementary to existing approaches without challenging specific claims." 582 }, 583 "demo_ability": { 584 "score": 1, 585 "justification": "Code is available on GitHub but requires setup with specific libraries and the custom dataset; not a quick-try tool." 586 }, 587 "brand_recognition": { 588 "score": 0, 589 "justification": "From William & Mary's SEMERU lab, not a widely recognized institution in the AI/ML community." 590 } 591 }, 592 "hn_data": { 593 "threads": [ 594 { 595 "hn_id": "46885582", 596 "title": "Who's in Charge? Disempowerment Patterns in Real-World LLM Usage", 597 "points": 3, 598 "comments": 0, 599 "url": "https://news.ycombinator.com/item?id=46885582", 600 "created_at": "2026-02-04T13:28:17Z" 601 }, 602 { 603 "hn_id": "47119379", 604 "title": "Who's in Charge? Disempowerment Patterns in Real-World LLM Usage", 605 "points": 2, 606 "comments": 1, 607 "url": "https://news.ycombinator.com/item?id=47119379", 608 "created_at": "2026-02-23T08:01:55Z" 609 }, 610 { 611 "hn_id": "46811142", 612 "title": "Anthropic: Who's in Charge? Disempowerment Patterns in Real-World LLM Usage", 613 "points": 2, 614 "comments": 1, 615 "url": "https://news.ycombinator.com/item?id=46811142", 616 "created_at": "2026-01-29T15:04:00Z" 617 }, 618 { 619 "hn_id": "47477667", 620 "title": "TinyTorch: Building Machine Learning Systems from First Principles", 621 "points": 2, 622 "comments": 0, 623 "url": "https://news.ycombinator.com/item?id=47477667", 624 "created_at": "2026-03-22T14:03:42Z" 625 } 626 ], 627 "top_points": 3, 628 "total_points": 9, 629 "total_comments": 2 630 } 631 }