scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (29626B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Detecting and Correcting Hallucinations in LLM-Generated Code via Deterministic AST Analysis",
      6     "authors": [
      7       "Dipin Khati",
      8       "Daniel Rodriguez-Cardenas",
      9       "Paul Pantzer",
     10       "Denys Poshyvanyk"
     11     ],
     12     "year": 2026,
     13     "venue": "FORGE '26",
     14     "arxiv_id": "2601.19106",
     15     "doi": "10.1145/3793655.3793725"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims of 100% precision, 87.6% recall, 0.934 F1, and 77.0% fix accuracy are all directly supported by results in §3 and Tables 1-4.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "The paper does not make causal claims. It reports detection/correction performance of a deterministic tool without claiming causal relationships.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The abstract claims the framework offers 'a clear path toward trustworthy code generation' broadly, but it was tested only on 200 Python snippets across 5 libraries. The title says 'LLM-Generated Code' without bounding to Python. §4 acknowledges some limitations but the framing in abstract/title/conclusion overgeneralizes.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No alternative explanations for the results are discussed. For instance, the high precision could partly reflect the simplicity of the curated dataset rather than inherent framework reliability.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper measures detection/correction on a curated dataset but frames results as 'trustworthy code generation' and 'reliable alternative to probabilistic repair' without discussing the gap between curated-dataset performance and real-world code generation trustworthiness.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "§4 (Discussion and Future Work) contains a substantial paragraph beginning 'We must acknowledge the limitations of this study' discussing dataset size, library coverage, single-file analysis, and scope of targeted errors.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "§4 discusses specific threats: '200-sample dataset...is not exhaustive', 'error distribution may not reflect real-world prevalence', 'Knowledge Base was limited to five Python libraries', 'does not yet handle multi-module dataflows'.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "§4 explicitly states: 'our approach deliberately targets KCHs and does not attempt to solve more complex, multi-line logical errors' and 'currently focuses on single-file, function-level analysis.'",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source is mentioned anywhere in the paper.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors are listed with William & Mary affiliation. No product being evaluated is affiliated with the authors.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not evidence of absence of conflict.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement is present in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "KCH defined as 'subtle, semantic errors' with two types: 'API Knowledge Conflicts' (non-existent functions/parameters) and 'Identifier Knowledge Conflicts' (variable misuse). Examples (pd.read_exel, max_len_str) provided.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Clear RQ1/RQ2 framework: deterministic static-analysis for detecting KCHs via AST+KB introspection. Novel: post-processing correction (vs. constrained decoding or LLM-in-the-loop approaches). Contribution explicitly positioned relative to prior work.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Engages with PICARD/Synchromesh (prevention-based), LLM-in-the-loop repair, Structural Trimming (deletion-based). Shows how this work differs (deterministic, correction-focused, resolution vs. deletion). Related work section (§5) references hallucination taxonomy work.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Replication package provided at GitHub (ref [3]: https://github.com/WM-SEMERU/Hallucinations-in-Code), explicitly stated as publicly available in §1.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The 200-sample dataset is stated to be part of the replication package: 'All data, code, and experimental configurations are publicly available in our replication package [3]' (§1).",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No environment specifications, requirements.txt, or dependency versions are mentioned in the paper. Only a GitHub link is provided.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions are provided in the paper. The replication package is mentioned but no instructions for running it are given.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Only point estimates are reported (100% precision, 87.6% recall, 77.0% fix accuracy). No confidence intervals or error bars.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": false,
    154           "answer": false,
    155           "justification": "The paper does not make comparative claims between systems. It evaluates a single deterministic system with no stochastic comparisons requiring significance tests.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": false,
    160           "answer": false,
    161           "justification": "No comparative claims are made between systems, so effect sizes are not applicable. The paper reports absolute performance of a single system.",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The dataset has 200 samples (161 hallucinated, 39 clean) but no justification for why this size was chosen or whether it is adequate for the claims made.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": false,
    172           "answer": false,
    173           "justification": "The system is fully deterministic, so there are no stochastic runs to report variance across. §2.5 reports only a single end-to-end runtime ('completed in under 0.2 seconds').",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "No baseline systems are compared against. The paper only reports its own framework's performance. Related work discusses PICARD, Synchromesh, LLM-in-the-loop repair, and Structural Trimming but does not compare against them experimentally.",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "No baselines are included, so contemporaneity cannot be assessed.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": false,
    193           "justification": "The framework has multiple components (AST parser, KB, validation rules for unknown API/bare calls/semantic inconsistency, correction module) but no ablation study isolating their contributions.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Reports precision, recall, F1-score for detection, and fix accuracy for correction (Tables 1-4, §3).",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "No human evaluation of the corrections is mentioned. Evaluation is entirely automated. Human evaluation would be relevant to assess whether corrections are semantically appropriate.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "The system is not trained/tuned — it is a deterministic rule-based framework. There is no training/validation/test split concern.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Tables 3 and 4 provide breakdowns by KCH type (Missing Imports, Mis-typed API Calls, Contextual Mismatches) and by library (numpy, pandas, matplotlib, json, requests).",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "§4 provides manual analysis of 37 failed cases (20 false negatives, 17 failed corrections) with specific examples like 'plt.plotx instead of plt.plot' and the surface-typo-vs-semantic-error problem.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Contextual Mismatches had only 33.3% detection rate and 0.0% correction accuracy (Table 3). Pandas had only 56.2% correction accuracy (Table 4). These are clearly negative results.",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "The dataset was generated using 'GPT-5' (§2.6) but no version, snapshot date, or API version is specified.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "Dataset was generated by 'prompting GPT-5 with task-oriented instructions' (§2.6) but the actual prompts used are not provided.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "No hyperparameters for GPT-5 generation (temperature, top-p, etc.) are reported. The edit-distance threshold for correction is also not specified.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used. The framework is a deterministic static analysis pipeline.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": false,
    261           "justification": "The paper states 200 samples were 'curated' from GPT-5 output but does not describe how many were initially generated, what curation criteria were applied, or how the 161/39 hallucinated/clean split was determined.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Replication package [3] is stated to contain all data, code, and experimental configurations.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "§2.6 describes dataset construction: 200 Python samples generated by prompting GPT-5 for 5 target libraries, composed of 161 hallucinated (3 categories) and 39 clean samples.",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants. Data is LLM-generated code snippets from a standard model, not a benchmark requiring recruitment description.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "The paper does not describe how GPT-5 outputs were selected, filtered, or curated into the final 200. How many were generated initially? What criteria determined inclusion? This is undocumented.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "The system being evaluated is a deterministic static analysis tool, not a pre-trained model. Contamination of model training data is not relevant to evaluating this framework.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "The framework is rule-based, not trained. No train/test overlap concern exists.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "The framework is deterministic and not trained on any data. Benchmark contamination is not applicable.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in this study.",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": true,
    359           "justification": "§2.5 reports: 'the end-to-end analysis of all 200 samples completed in under 0.2 seconds on a single laptop CPU.'",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": true,
    365           "justification": "The compute is minimal and stated: under 0.2 seconds on a single laptop CPU for all 200 samples (§2.5).",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": false,
    372           "answer": false,
    373           "justification": "The framework is fully deterministic with no random seeds. No stochastic component exists.",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": false,
    378           "answer": false,
    379           "justification": "Deterministic system — a single run always produces the same output. Multiple runs are unnecessary.",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "The framework likely has tunable parameters (e.g., edit distance threshold for fuzzy matching) but no hyperparameter search is described.",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": false,
    391           "justification": "No discussion of how design choices (e.g., edit distance thresholds, semantic cue definitions) were selected or validated.",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": false,
    396           "answer": false,
    397           "justification": "No statistical tests are performed, so multiple comparison correction is not applicable.",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": false,
    403           "justification": "The authors evaluate their own system on their own curated dataset without acknowledging the potential bias of designing both the tool and its evaluation data.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": false,
    408           "answer": false,
    409           "justification": "No comparison with other systems, so compute-matched comparison is not applicable.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": false,
    415           "justification": "The 200-sample curated dataset's representativeness of real-world KCH distribution is not discussed. §4 briefly notes 'error distribution may not reflect real-world prevalence' but does not analyze construct validity.",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": false,
    420           "answer": false,
    421           "justification": "No scaffolding is involved. The system is a standalone static analysis tool.",
    422           "source": "opus"
    423         }
    424       },
    425       "data_leakage": {
    426         "temporal_leakage_addressed": {
    427           "applies": false,
    428           "answer": false,
    429           "justification": "The evaluated system is a deterministic rule-based tool, not a trained model. Temporal leakage of training data is not applicable.",
    430           "source": "opus"
    431         },
    432         "feature_leakage_addressed": {
    433           "applies": false,
    434           "answer": false,
    435           "justification": "The evaluated system is rule-based, not a trained model. Feature leakage is not applicable.",
    436           "source": "opus"
    437         },
    438         "non_independence_addressed": {
    439           "applies": false,
    440           "answer": false,
    441           "justification": "No trained model is evaluated. Non-independence of train/test data is not applicable.",
    442           "source": "opus"
    443         },
    444         "leakage_detection_method": {
    445           "applies": false,
    446           "answer": false,
    447           "justification": "No trained model is evaluated. Leakage detection is not applicable.",
    448           "source": "opus"
    449         }
    450       }
    451     }
    452   },
    453   "claims": [
    454     {
    455       "claim": "Framework detects KCHs with 100% precision and 87.6% recall on the 200-sample dataset",
    456       "evidence": "Table 1: 141 true positives, 0 false positives, 20 false negatives, 39 true negatives. Precision=141/141=100%, Recall=141/161=87.6%, F1=0.934.",
    457       "supported": "strong"
    458     },
    459     {
    460       "claim": "Framework auto-corrects 77% of identified hallucinations (124/161)",
    461       "evidence": "Table 2: Total hallucinated samples identified = 161, successfully corrected = 124, fix accuracy = 77.0%.",
    462       "supported": "strong"
    463     },
    464     {
    465       "claim": "Missing imports have highest detection/correction rate (97.9%)",
    466       "evidence": "Table 3: 48 samples, 97.9% detection rate, 97.9% correction rate.",
    467       "supported": "strong"
    468     },
    469     {
    470       "claim": "Contextual mismatches are hardest type (33.3% detection, 0% correction)",
    471       "evidence": "Table 3: Only 3 samples of contextual mismatches, 33.3% detection, 0/3 corrected (0% accuracy).",
    472       "supported": "moderate"
    473     },
    474     {
    475       "claim": "Deterministic approach is reliable alternative to probabilistic repair methods",
    476       "evidence": "Discussion contrasts with PICARD, Synchromesh, LLM-in-the-loop. No empirical comparison provided; claim is logical but not validated.",
    477       "supported": "moderate"
    478     },
    479     {
    480       "claim": "Pandas library shows lower performance (56.2% correction) due to semantic complexity",
    481       "evidence": "Table 4: pandas 85.1% detection but only 56.2% correction. Discussion notes pandas failures involve 'surface-level typo vs. semantic error' distinction.",
    482       "supported": "moderate"
    483     }
    484   ],
    485   "methodology_tags": [
    486     "benchmark-eval",
    487     "case-study"
    488   ],
    489   "key_findings": "The paper presents a static-analysis framework that detects knowledge-conflicting hallucinations (KCHs) in LLM-generated code with 100% precision and 87.6% recall, and auto-corrects 77.0% of all hallucinated samples (124/161) using AST parsing and library introspection. The framework excels on missing-import errors (97.9%) but struggles with contextual mismatches (0% correction) and with pandas code (56.2% correction). A critical limitation is that 'functionally correct' code is claimed but only AST structure—not actual execution—is verified.",
    490   "red_flags": [
    491     {
    492       "flag": "No baseline comparisons",
    493       "detail": "Paper positions its deterministic approach as a reliable alternative to PICARD, Synchromesh, LLM-in-the-loop repair, and mypy but provides zero empirical comparison. These baselines are discussed only conceptually, not tested on the same dataset."
    494     },
    495     {
    496       "flag": "Proxy outcome mismatch",
    497       "detail": "Paper claims 77% fix accuracy for 'functionally correct, runnable code' but explicitly states code is analyzed 'without running the code.' Only AST structure correctness is verified, not actual execution."
    498     },
    499     {
    500       "flag": "Suspiciously high precision (100%)",
    501       "detail": "Perfect precision on 200 manually-curated samples raises overfitting concerns. No confidence intervals or cross-validation. Could indicate dataset is too simple or heavily biased toward easy cases."
    502     },
    503     {
    504       "flag": "Severe variance by category",
    505       "detail": "Contextual mismatches: 0/3 corrected (0%). Pandas: 56.2% correction. This extreme variance (0%–97.9%) suggests poor generalization and high library-dependence."
    506     },
    507     {
    508       "flag": "Missing generation prompts",
    509       "detail": "Dataset created by prompting GPT-5 but actual prompts not provided. Cannot reproduce dataset generation or assess prompt bias."
    510     },
    511     {
    512       "flag": "No inter-rater agreement",
    513       "detail": "200-sample dataset appears single-annotated by authors. No inter-rater reliability study to validate what constitutes a KCH."
    514     },
    515     {
    516       "flag": "Small sample with extreme confidence",
    517       "detail": "200 samples is small. Only 3 contextual-mismatch samples. 100% precision and 77% correction claims on this scale lack robustness."
    518     },
    519     {
    520       "flag": "Generalization not tested",
    521       "detail": "Results limited to 5 libraries (numpy, pandas, requests, matplotlib, json). Unclear if framework works on scipy, tensorflow, torch, or industry libraries."
    522     },
    523     {
    524       "flag": "No cost comparison to baselines",
    525       "detail": "Paper reports 0.2s for framework but does not compare cost/quality tradeoff with LLM-in-the-loop or constrained decoding approaches."
    526     },
    527     {
    528       "flag": "Missing statistical rigor",
    529       "detail": "No confidence intervals, p-values, or significance tests. Single evaluation without bootstrap, cross-validation, or hold-out validation."
    530     }
    531   ],
    532   "cited_papers": [
    533     {
    534       "title": "Bugs in Large Language Models Generated Code: An Empirical Study",
    535       "relevance": "Foundational taxonomy of LLM code bugs; establishes hallucination as a known failure mode."
    536     },
    537     {
    538       "title": "Exploring and Evaluating Hallucinations in LLM-Powered Code Generation",
    539       "relevance": "Defines Knowledge Conflicting Hallucinations (KCH) category; prior work this paper builds on."
    540     },
    541     {
    542       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    543       "relevance": "Motivates productivity gains and risks from LLM code tools; field evidence for the problem space."
    544     },
    545     {
    546       "title": "PICARD: Parsing Incrementally for Constrained Auto-Regressive Decoding",
    547       "relevance": "Constrained decoding baseline; prevents syntax errors but misses semantic KCHs."
    548     },
    549     {
    550       "title": "Synchromesh: Reliable code generation from pre-trained language models",
    551       "relevance": "Prevention-based approach; paper contrasts deterministic repair vs. prevention strategies."
    552     },
    553     {
    554       "title": "Towards Understanding the Characteristics of Code Generation Errors Made by Large Language Models",
    555       "relevance": "Error taxonomy; provides context for error classification and mitigation strategies."
    556     },
    557     {
    558       "title": "Mapping the Trust Terrain: LLMs in Software Engineering - Insights and Perspectives",
    559       "relevance": "Trust and developer experience with LLM code generation; motivates reliability of fixes."
    560     },
    561     {
    562       "title": "Cutting the Root of Hallucination: Structural Trimming for Vulnerability Mitigation in Code LLMs",
    563       "relevance": "Deletion-based repair approach; paper contrasts trimming (safety) vs. correction (functionality)."
    564     }
    565   ],
    566   "engagement_factors": {
    567     "practical_relevance": {
    568       "score": 2,
    569       "justification": "AST-based hallucination detection for LLM code is directly applicable to developer workflows, though the tool only covers 5 Python libraries currently."
    570     },
    571     "surprise_contrarian": {
    572       "score": 0,
    573       "justification": "The idea that static analysis can catch API misuse is well-understood; the results confirm expectations rather than challenging them."
    574     },
    575     "fear_safety": {
    576       "score": 0,
    577       "justification": "Addresses code correctness rather than safety, security, or misuse concerns."
    578     },
    579     "drama_conflict": {
    580       "score": 0,
    581       "justification": "No controversy or conflict; positions itself as complementary to existing approaches without challenging specific claims."
    582     },
    583     "demo_ability": {
    584       "score": 1,
    585       "justification": "Code is available on GitHub but requires setup with specific libraries and the custom dataset; not a quick-try tool."
    586     },
    587     "brand_recognition": {
    588       "score": 0,
    589       "justification": "From William & Mary's SEMERU lab, not a widely recognized institution in the AI/ML community."
    590     }
    591   },
    592   "hn_data": {
    593     "threads": [
    594       {
    595         "hn_id": "46885582",
    596         "title": "Who's in Charge? Disempowerment Patterns in Real-World LLM Usage",
    597         "points": 3,
    598         "comments": 0,
    599         "url": "https://news.ycombinator.com/item?id=46885582",
    600         "created_at": "2026-02-04T13:28:17Z"
    601       },
    602       {
    603         "hn_id": "47119379",
    604         "title": "Who's in Charge? Disempowerment Patterns in Real-World LLM Usage",
    605         "points": 2,
    606         "comments": 1,
    607         "url": "https://news.ycombinator.com/item?id=47119379",
    608         "created_at": "2026-02-23T08:01:55Z"
    609       },
    610       {
    611         "hn_id": "46811142",
    612         "title": "Anthropic: Who's in Charge? Disempowerment Patterns in Real-World LLM Usage",
    613         "points": 2,
    614         "comments": 1,
    615         "url": "https://news.ycombinator.com/item?id=46811142",
    616         "created_at": "2026-01-29T15:04:00Z"
    617       },
    618       {
    619         "hn_id": "47477667",
    620         "title": "TinyTorch: Building Machine Learning Systems from First Principles",
    621         "points": 2,
    622         "comments": 0,
    623         "url": "https://news.ycombinator.com/item?id=47477667",
    624         "created_at": "2026-03-22T14:03:42Z"
    625       }
    626     ],
    627     "top_points": 3,
    628     "total_points": 9,
    629     "total_comments": 2
    630   }
    631 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs