scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27811B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Detecting and Correcting Hallucinations in LLM-Generated Code via Deterministic AST Analysis",
      6     "authors": [
      7       "Dipin Khati",
      8       "Daniel Rodriguez-Cardenas",
      9       "Paul Pantzer",
     10       "Denys Poshyvanyk"
     11     ],
     12     "year": 2026,
     13     "venue": "FORGE '26 (IEEE/ACM Third International Conference on AI Foundation Models and Software Engineering)",
     14     "arxiv_id": "2601.19106",
     15     "doi": "10.1145/3793655.3793725"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims are supported: KCHs are explained with examples, existing mitigations are discussed in §1, and empirical results match stated performance (100% precision, 87.6% recall, 77.0% fix rate).",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "The paper makes no causal claims about mechanisms ('X causes Y'); it demonstrates detection/correction works empirically but does not claim to explain why hallucinations occur or why the deterministic approach succeeds mechanistically.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper explicitly bounds scope: 'limited to five Python libraries', 'single-file, function-level analysis', and 'error distribution may not reflect real-world prevalence', while noting potential extension to other languages with AST support.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper compares against existing approaches (constrained decoding, LLM-in-the-loop, deletion-based repair) but does not discuss alternative explanations for why their results hold (e.g., whether high precision is due to dataset properties, or whether 100% is inflated by easy cases).",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims to measure 'fix accuracy' as 'functionally correct, runnable code' but the evaluation is non-executing. No mechanism for validating that corrected code is actually correct is described (e.g., no ground truth comparison, no human review, no execution).",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 4 'Discussion and Future Work' includes a dedicated limitations paragraph acknowledging dataset size, library scope, and architectural constraints.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats are stated: '200-sample dataset is not exhaustive', 'Knowledge Base limited to five Python libraries', 'single-file analysis does not handle multi-module dataflows', and approach 'does not attempt to solve multi-line logical errors'.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Explicit scope boundaries: targets KCHs only (API + identifier conflicts), evaluated on Python snippets, limited to five libraries, single-file function-level analysis, and not addressing complex multi-line logical errors.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source disclosed in abstract, body, or visible acknowledgments section.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors list affiliation with William & Mary, and no evaluated product is developed by the authors or institution, so conflict is minimal.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funder disclosed; NA.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or declaration of patents, equity, or consulting relationships provided.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined: KCHs as 'code that flat-out contradicts the established, factual knowledge of a programming language or its libraries', AST parsing, and 'Dynamic Knowledge Base' via introspection are explained.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Contribution is explicit: a deterministic post-processing framework for detecting AND correcting (not just deleting) KCHs in LLM code, positioned against prevention, LLM-in-the-loop, and deletion approaches.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "§1 and §5 systematically engage with prior work (taxonomy [11], KCH definition [6], prevention [8,10], LLM-in-the-loop [1,9], deletion [14], type-checkers [5]), positioning the deterministic correction approach as novel.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Paper states 'All data, code, and experimental configurations are publicly available in our replication package [3]' linking to https://github.com/WM-SEMERU/Hallucinations-in-Code.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The 200-sample evaluation dataset is stated to be in the replication package alongside code, enabling independent verification.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Python 3 is implied but no requirements.txt, Dockerfile, or dependency specifications provided. No virtual environment or package versions documented.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Paper provides high-level methodology but no step-by-step reproduction instructions. Replication package may contain these, but they are not included in the paper itself.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "No confidence intervals, error bars, or variance measures reported. Precision/recall/F1 are single point estimates without uncertainty quantification.",
    149         "source": "haiku"
    150       },
    151       "significance_tests": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "No statistical significance tests, cross-validation, or bootstrapping reported. No p-values or hypothesis testing for comparative claims.",
    155         "source": "haiku"
    156       },
    157       "effect_sizes_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Effect sizes provided: 100% precision, 87.6% recall, F1=0.934, 77% fix accuracy, with per-type and per-library breakdowns (Tables 3–4).",
    161         "source": "haiku"
    162       },
    163       "sample_size_justified": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "Sample size (n=200 total, 161 hallucinated, 39 clean) is not justified. No power analysis or rationale provided for why 200 is adequate.",
    167         "source": "haiku"
    168       },
    169       "variance_reported": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "Variance/std dev not reported. Only single point estimates; no repeated runs or error margins across samples or folds.",
    173         "source": "haiku"
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "No empirical baselines compared. PICARD, Synchromesh, LLM-in-the-loop, and Structural Trimming are discussed but not experimentally evaluated.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": false,
    184           "answer": false,
    185           "justification": "NA—no baselines included.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": false,
    191           "justification": "No ablation study. The system has four components (AST parsing, KB construction, validation, correction) but no variant testing (e.g., KB vs no KB).",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Detection metrics (precision, recall, F1), correction accuracy, and per-category breakdowns (Tables 3–4 by type and library) provide multiple evaluation angles.",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": true,
    202           "answer": false,
    203           "justification": "Dataset is manually curated, but no human evaluation of system outputs or corrected code is reported.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": false,
    208           "answer": false,
    209           "justification": "Not a prediction task; no train/test split or held-out evaluation strategy described.",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Table 3 breaks down by hallucination type (Missing Imports, Mis-typed API, Contextual Mismatches); Table 4 by library (numpy, pandas, matplotlib, json, requests).",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Manual analysis of 37 failed cases (20 false negatives, 17 failed corrections) is discussed, revealing matplotlib.pyplot struggles and pandas correction weakness (56.2% correction accuracy for pandas, versus 97.9% for the Missing Imports category).",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Lower performance on contextual mismatches (33.3% detect, 0% correct) and pandas (56.2% correction) is transparently reported, along with discussion of limitations.",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "Dataset generation via 'GPT-5 with task-oriented instructions' is mentioned but no actual prompts, model version (snapshot), or hyperparameters (temperature, top-p) provided for reproducibility.",
    235         "source": "haiku"
    236       },
    237       "model_versions_specified": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "'GPT-5' is named but no API version, snapshot date, or configuration parameters given.",
    241         "source": "haiku"
    242       },
    243       "prompts_provided": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No actual prompts or instructions provided for GPT-5 data generation; only high-level description 'task-oriented instructions for five target libraries'.",
    247         "source": "haiku"
    248       },
    249       "hyperparameters_reported": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No temperature, top-p, max_tokens, or other sampling parameters reported for GPT-5. No hyperparameters for the framework itself (O(n·m) complexity is noted but no tuning parameters).",
    253         "source": "haiku"
    254       },
    255       "scaffolding_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "NA—the framework is deterministic static analysis, not an agent with scaffolding.",
    259         "source": "haiku"
    260       },
    261       "data_preprocessing_documented": {
    262         "applies": true,
    263         "answer": true,
    264         "justification": "Dataset construction is documented: curated to contain 161 hallucinated samples in three categories (Mis-typed APIs, Missing imports, Contextual mismatches) and 39 clean samples from five libraries.",
    265         "source": "haiku"
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "Paper claims 'All data, code, and experimental configurations are publicly available in our replication package [3]' on GitHub.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Data collection via GPT-5 prompting is described; dataset composition (161 hallucinated, 39 clean) and categories are documented.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "NA—no human subjects; synthetic dataset from LLM prompting.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "Framework pipeline is well-documented in §2: Static Analysis → Dynamic KB → Deterministic Validation → Automated Correction, with each component explained.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": false,
    296           "answer": false,
    297           "justification": "NA—paper does not evaluate pre-trained models on benchmarks; it tests a deterministic tool on a synthetic dataset.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "NA—same as above.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": false,
    308           "answer": false,
    309           "justification": "NA—same as above.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "NA—no human subjects.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "NA—no human subjects.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "NA—no human subjects.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "NA—no human subjects.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "NA—no human subjects.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "NA—no human subjects.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "NA—no human subjects.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": true,
    361           "justification": "Runtime is reported: 'end-to-end analysis of all 200 samples completed in under 0.2 seconds on a single laptop CPU', demonstrating practical efficiency.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No compute budget stated for dataset generation (GPT-5 API costs) or evaluation infrastructure.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "Large Language Models frequently produce Knowledge Conflicting Hallucinations (KCHs)—semantic errors like non-existent API parameters that evade linters and cause runtime failures.",
    376       "evidence": "Examples given (pd.read_exel), cited prior work [11, 12, 6], but not quantified in this paper.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Constrained decoding methods (PICARD, Synchromesh) fail to catch semantic errors because they only enforce syntactic validity.",
    381       "evidence": "Discussed in §1 and §5; no empirical comparison provided.",
    382       "supported": "weak"
    383     },
    384     {
    385       "claim": "A deterministic static-analysis framework using AST parsing and library introspection can detect KCHs with 100% precision (zero false positives).",
    386       "evidence": "Table 1: 141 TP, 0 FP out of 200 samples, achieving 100% precision.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "The framework achieves 87.6% recall in KCH detection, identifying 141 of 161 hallucinated samples.",
    391       "evidence": "Table 1: 141 TP, 20 FN, F1=0.934.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "The framework can automatically correct 77% of detected hallucinations, producing functionally correct code.",
    396       "evidence": "Table 2: 124 of 161 hallucinated samples corrected (77.0%; 141 of the 161 were detected). However, no validation method is described (code is not executed; no ground truth comparison stated).",
    397       "supported": "weak"
    398     },
    399     {
    400       "claim": "Performance varies significantly by hallucination type: Missing Imports (97.9% detect, 97.9% correct), Mis-typed APIs (84.5% detect, 70.0% correct), Contextual Mismatches (33.3% detect, 0% correct).",
    401       "evidence": "Table 3 provides detailed breakdown by type.",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "The deterministic approach is computationally efficient, analyzing all 200 samples in under 0.2 seconds on a laptop CPU.",
    406       "evidence": "Stated in §2.5 and §4.",
    407       "supported": "strong"
    408     },
    409     {
    410       "claim": "The framework is a viable alternative to non-deterministic LLM-in-the-loop repair.",
    411       "evidence": "Discussed in §1 and §4 as discussion point, but not empirically compared.",
    412       "supported": "weak"
    413     }
    414   ],
    415   "methodology_tags": [
    416     "benchmark-eval"
    417   ],
    418   "key_findings": "A deterministic, static-analysis framework leveraging Abstract Syntax Trees and library introspection via dynamic knowledge base construction can detect Knowledge Conflicting Hallucinations (KCHs) in LLM-generated Python code with 100% precision and 87.6% recall (F1=0.934), automatically correcting 77% of identified errors. Performance varies by error type: Missing Imports are highly recoverable (97.9% detect, 97.9% correct), Mis-typed APIs moderate (84.5% detect, 70.0% correct), and Contextual Mismatches poorly handled (33.3% detect, 0% correct), suggesting that semantic-intent errors remain intractable for simple string-matching approaches. The framework runs efficiently in <0.2 seconds for 200 samples, but evaluation is limited to 200 manually-curated samples across five Python libraries, raising questions about real-world prevalence and generalizability.",
    419   "red_flags": [
    420     {
    421       "flag": "No empirical baseline comparison",
    422       "detail": "Claims superiority over PICARD, Synchromesh, LLM-in-the-loop repair, and mypy but provides no direct experimental comparison. Comparisons are only qualitative discussion."
    423     },
    424     {
    425       "flag": "Small, manually curated dataset may not reflect real-world error distribution",
    426       "detail": "200 samples (161 hallucinated, 39 clean) is acknowledged as potentially biased. Authors note 'error distribution may not reflect real-world prevalence'."
    427     },
    428     {
    429       "flag": "Correction verification method not stated",
    430       "detail": "Paper claims 'fix accuracy' by measuring 'functionally correct, runnable code' but the approach is explicitly non-executing. No ground truth comparison, human review, or execution validation described."
    431     },
    432     {
    433       "flag": "Limited generalizability",
    434       "detail": "Evaluation restricted to Python; Knowledge Base limited to five libraries (numpy, pandas, requests, matplotlib, json). Claim of generalizability to Java/TypeScript is speculative."
    435     },
    436     {
    437       "flag": "No confidence intervals or statistical testing",
    438       "detail": "Single point estimates for precision, recall, F1 without uncertainty quantification, confidence intervals, or cross-validation."
    439     },
    440     {
    441       "flag": "GPT-5 dataset generation not reproducible",
    442       "detail": "Prompts, model version (snapshot date), temperature, and hyperparameters for GPT-5 not provided. Cannot regenerate the evaluation dataset independently."
    443     },
    444     {
    445       "flag": "Contextual Mismatches nearly undetectable",
    446       "detail": "Only 3 samples (1.5% of dataset); 33.3% detection, 0% correction. This critical category is under-represented and handled poorly."
    447     },
    448     {
    449       "flag": "Pandas performance significantly lower",
    450       "detail": "Pandas achieves only 56.2% correction accuracy vs 93.8% for numpy and 93.9% for requests, but no analysis of why or how to improve."
    451     },
    452     {
    453       "flag": "No human evaluation of corrections",
    454       "detail": "Corrected code samples not reviewed by developers or automated validators to confirm functional correctness."
    455     },
    456     {
    457       "flag": "Missing environment and reproduction specifications",
    458       "detail": "No requirements.txt, Dockerfile, or dependency versions provided. Replication package may exist but paper itself lacks these details."
    459     }
    460   ],
    461   "cited_papers": [
    462     {
    463       "title": "Exploring and Evaluating Hallucinations in LLM-Powered Code Generation",
    464       "authors": "Liu, F., Liu, Y., Shi, L., et al.",
    465       "arxiv_id": "2404.00971",
    466       "year": 2024,
    467       "relevance": "Defines KCH (Knowledge Conflicting Hallucinations) taxonomy and benchmarks; foundational reference for this paper's problem statement."
    468     },
    469     {
    470       "title": "Bugs in Large Language Models Generated Code: An Empirical Study",
    471       "authors": "Tambon, F., Moradi Dakhel, A., et al.",
    472       "arxiv_id": "2403.08937",
    473       "year": 2024,
    474       "relevance": "Early taxonomy of LLM code generation bugs; establishes prevalence of hallucinations in the field."
    475     },
    476     {
    477       "title": "Hallucination by Code Generation LLMs: Taxonomy, Benchmarks, Mitigation, and Challenges",
    478       "authors": "Lee, Y., Song, J. Y., Kim, D., et al.",
    479       "arxiv_id": "2504.20799",
    480       "year": 2025,
    481       "relevance": "Comprehensive survey of hallucination types and mitigation strategies; directly relevant to positioning this work."
    482     },
    483     {
    484       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    485       "authors": "Peng, S., Kalliamvakou, E., Cihon, P., Demirer, M.",
    486       "arxiv_id": "2302.06590",
    487       "year": 2023,
    488       "relevance": "Establishes productivity gains from LLM code generation; motivates the need for hallucination mitigation."
    489     },
    490     {
    491       "title": "Synchromesh: Reliable code generation from pre-trained language models",
    492       "authors": "Poesia, G., Polozov, O., Le, V., et al.",
    493       "arxiv_id": "2201.11227",
    494       "year": 2022,
    495       "relevance": "Constrained decoding approach for code generation; example of prevention strategy that misses semantic errors."
    496     },
    497     {
    498       "title": "PICARD: Parsing Incrementally for Constrained Auto-Regressive Decoding from Language Models",
    499       "authors": "Scholak, T., Schucher, N., Bahdanau, D.",
    500       "year": 2021,
    501       "relevance": "Foundational constrained decoding method for grammar enforcement; argued (without empirical comparison) to miss KCHs."
    502     },
    503     {
    504       "title": "Static Analysis as a Feedback Loop: Enhancing LLM-Generated Code Beyond Correctness",
    505       "authors": "Blyth, S., Licorish, S. A., Treude, C., Wagner, M.",
    506       "arxiv_id": "2508.14419",
    507       "year": 2025,
    508       "relevance": "LLM-in-the-loop repair strategy; represents non-deterministic approach this paper positions against."
    509     },
    510     {
    511       "title": "Cutting the Root of Hallucination: Structural Trimming for Vulnerability Mitigation in Code LLMs",
    512       "authors": "Zhang, Y.",
    513       "year": 2025,
    514       "relevance": "AST-based deletion approach for safety; represents deletion-based mitigation that this paper extends toward correction."
    515     }
    516   ],
    517   "engagement_factors": {
    518     "practical_relevance": {
    519       "score": 2,
    520       "justification": "Tool could be integrated into IDEs for real-time code-generation validation, directly useful for practitioners, but limited scope (5 libraries, single-file) reduces immediate applicability."
    521     },
    522     "surprise_contrarian": {
    523       "score": 1,
    524       "justification": "Using static analysis for code correctness is well-established (mypy, linters); applying it post-hoc to LLM hallucinations is incremental rather than conceptually novel."
    525     },
    526     "fear_safety": {
    527       "score": 2,
    528       "justification": "Addresses a real safety concern (LLM-generated code causing runtime failures), positioning deterministic checking as a trust-building mechanism for AI-assisted development."
    529     },
    530     "drama_conflict": {
    531       "score": 0,
    532       "justification": "Straightforward technical paper with no controversy, competing claims, or dramatic narrative elements."
    533     },
    534     "demo_ability": {
    535       "score": 1,
    536       "justification": "Code is open-source on GitHub and can be demoed locally, but requires setup (library introspection); not immediately web-demoable or friction-free."
    537     },
    538     "brand_recognition": {
    539       "score": 1,
    540       "justification": "William & Mary's SEMERU Lab is respected in software engineering research but not a top-tier AI lab; lead author Dipin Khati not widely known in the field."
    541     }
    542   },
    543   "hn_data": {
    544     "threads": [
    545       {
    546         "hn_id": "46885582",
    547         "title": "Who's in Charge? Disempowerment Patterns in Real-World LLM Usage",
    548         "points": 3,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=46885582",
    551         "created_at": "2026-02-04T13:28:17Z"
    552       },
    553       {
    554         "hn_id": "47119379",
    555         "title": "Who's in Charge? Disempowerment Patterns in Real-World LLM Usage",
    556         "points": 2,
    557         "comments": 1,
    558         "url": "https://news.ycombinator.com/item?id=47119379",
    559         "created_at": "2026-02-23T08:01:55Z"
    560       },
    561       {
    562         "hn_id": "46811142",
    563         "title": "Anthropic: Who's in Charge? Disempowerment Patterns in Real-World LLM Usage",
    564         "points": 2,
    565         "comments": 1,
    566         "url": "https://news.ycombinator.com/item?id=46811142",
    567         "created_at": "2026-01-29T15:04:00Z"
    568       },
    569       {
    570         "hn_id": "47477667",
    571         "title": "TinyTorch: Building Machine Learning Systems from First Principles",
    572         "points": 2,
    573         "comments": 0,
    574         "url": "https://news.ycombinator.com/item?id=47477667",
    575         "created_at": "2026-03-22T14:03:42Z"
    576       }
    577     ],
    578     "top_points": 3,
    579     "total_points": 9,
    580     "total_comments": 2
    581   }
    582 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs