scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29454B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "How Do Data Analysts Respond to AI Assistance? A Wizard-of-Oz Study",
      6     "authors": [
      7       "Ken Gu",
      8       "Madeleine Grunde-McLaughlin",
      9       "Andrew McNutt",
     10       "Jeffrey Heer",
     11       "Tim Althoff"
     12     ],
     13     "year": 2023,
     14     "venue": "International Conference on Human Factors in Computing Systems",
     15     "arxiv_id": "2309.10108",
     16     "doi": "10.1145/3613904.3641891"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract's claims are supported: n=13 WoZ study was conducted, suggestion categories were developed from literature review and crowd-sourced analysis, and design guidelines were derived from study findings.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims 'well-timed and contextual planning assistance often helped analysts consider and make alternative decisions,' a causal claim, but the n=13 WoZ lab study with no control condition is insufficient for causal inference.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Generalizations are bounded to high-proficiency data analysts, a single task and dataset, and a 2-hour lab setting; the limitations section explicitly notes constraints on generalizability to novices and longitudinal use.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper notes novelty effects briefly in limitations but does not systematically consider alternative explanations for observed preferences such as social desirability bias or wizard selection effects on what suggestions participants saw.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper measures analyst ratings (thumbs up/down), suggestion incorporation rates (51.6%), and interview responses, and explicitly frames these as measures of analyst preferences rather than objective improvements in analysis quality.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6.2 'Limitations and Future Work' is a dedicated subsection covering participant selection, study design constraints, and analytical scope.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are named: self-selected high-proficiency analysts may positively bias results, one-shot 2-hour lab design may not reflect longitudinal adaptation, and lab setting may inflate willingness to accept suggestions vs. real-world stakes.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly excludes novice analysts, focuses on computational notebook context with one dataset and research question, and notes that analysis quality was not evaluated — only analyst preferences.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments disclose funding from NSF grants IIS-1901386 and NSF CAREER IIS-2142794, and the Bill & Melinda Gates Foundation (INV-004841).",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five authors list University of Washington affiliation on the first page.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "NSF and the Bill & Melinda Gates Foundation have no direct stake in the design recommendations or tool being studied.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement, patent disclosures, or equity declarations are present in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "'Analysis execution' and 'analysis planning' are explicitly defined in the introduction; 'Wizard of Oz' is cited and the protocol described; the planning vs. execution assistance distinction is clearly established with examples.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Two explicit contributions are stated: (1) the WoZ study characterizing analyst preferences for planning assistance (labeled as 'our first contribution'), and (2) design guidelines for analysis assistants ('our second contribution').",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 engages substantively with prior work on analysis planning, LLM-based code assistants (Copilot, ChatGPT), and Human-AI interaction, showing how this work extends rather than merely lists existing contributions.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "JupyterLab assistant interface code is released at https://github.com/behavioral-data/Data-Assistant-Interface; study materials on OSF at https://osf.io/9x8bj/.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "OSF contains 'study materials' (protocols, instruments) but raw session data (recordings, transcripts) is not explicitly confirmed as available; the soccer analysis dataset is from a prior published study.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements file or Dockerfile is provided; the paper notes JupyterLab was used on a MacBook Pro but gives no dependency specifications.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are included; the WoZ study requires a trained wizard with a pre-built suggestion spreadsheet, which is not fully documented for independent replication.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Standard deviations are reported for suggestion counts (mean=11.85, std=4.56), but no confidence intervals are reported; the main findings are qualitative.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": false,
    155           "answer": false,
    156           "justification": "The study is qualitative and exploratory; no comparative hypothesis tests are conducted or expected.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": false,
    161           "answer": false,
    162           "justification": "Qualitative study with no hypothesis testing; effect sizes are not applicable.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 13 participants were selected on a first-come basis from 60 volunteers, with no power analysis and no justification for why 13 participants provide sufficient qualitative saturation.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Standard deviation is reported for suggestion counts but the main qualitative findings (category helpfulness, interview themes) have no variance reporting.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": false,
    181           "answer": false,
    182           "justification": "This is an exploratory design study, not a comparative system evaluation; a no-assistant baseline condition is neither included nor expected for this research purpose.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": false,
    187           "answer": false,
    188           "justification": "N/A — no baselines.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": false,
    193           "answer": false,
    194           "justification": "N/A — this is a qualitative design study, not a system with components to ablate.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics are used: analyst ratings per suggestion (positive/neutral/negative), suggestion incorporation rates (47/91 = 51.6%), think-aloud observations, and post-task semi-structured interview data.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "The entire study is human evaluation: 13 practicing data analysts rated and reflected on AI assistant suggestions during a live analysis task.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "Not a prediction or classification task.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Figure 5 shows suggestion helpfulness ratings broken down by all eight suggestion categories (domain background, data wrangling, conceptual model formulation, operationalizing constructs, choosing statistical model, model results interpretation, high-level planning, execution assistance).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Specific failure cases are discussed throughout: poorly-timed suggestions ignored by engrossed analysts, suggestions too basic for statistical experts, over-reliance/autopilot behavior, and confusion from unfamiliar statistical recommendations.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Negative results are reported: suggestion categories that were sometimes unhelpful, goal misalignment between analyst and assistant, and over-reliance reducing critical engagement with the analysis.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "ChatGPT is used for generating suggestions but no specific version (GPT-3.5 vs GPT-4) or API snapshot date is specified; only 'OpenAI. 2022. ChatGPT' is cited.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The paper mentions 'a general prompt introducing the task and dataset' and real-time prompting by the wizard, but no actual prompts are provided in the paper or appendix.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No temperature, top-p, or other generation hyperparameters are reported for ChatGPT usage.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The wizard's scaffolding role is described in detail in Figure 3 and Appendix B: wizard observes notebook context, retrieves prewritten suggestions or prompts ChatGPT, may refine outputs via additional prompting, then presents the suggestion to the analyst.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Dataset preprocessing is described: subsampled for computational simplicity while maintaining outcome distribution, limited to the 10 most frequently used variables from the original crowd-sourced study.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Study materials are on OSF but raw session data (recordings, coded transcripts, suggestion logs) is not explicitly confirmed as available for independent verification.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Data collection is described: Zoom recordings with consent, coordinator notes during analysis and interview phases, one author reviewed recordings and transcribed relevant episodes, two authors conducted iterative open coding.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Recruitment is described: analysis-related mailing lists at the University of Washington yielded a pool of 60 volunteers; those with programming/stats proficiency ≥4/5 who were familiar with computational notebooks were selected, and the final 13 were invited on a first-come basis.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The analysis pipeline is described: recording → transcription of relevant episodes → iterative open coding by two authors to define common themes, with thematic analysis cited.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "This study evaluates human analyst behavior in response to suggestions; it does not evaluate model capabilities on a benchmark, so training cutoff is irrelevant.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "N/A — not a benchmark evaluation study.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "N/A — not a benchmark evaluation study.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No pre-registration is mentioned anywhere in the paper.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "IRB or ethics board approval is not mentioned; the paper only notes sessions were 'recorded with consent' but provides no institutional review details.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": true,
    328           "justification": "Table 2 reports demographics for all 13 participants: gender, occupation/background, prior AI code assistant experience, preferred programming language, and self-rated language and statistics experience.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": true,
    334           "justification": "Inclusion criteria specified: programming and statistics proficiency ≥4/5 and familiarity with computational notebooks; novices were explicitly excluded because they would be 'too limited by execution challenges to benefit from analysis planning.'",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "All participants received the same WoZ treatment with the same task; randomization is not applicable to this single-condition exploratory design.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": true,
    346           "justification": "Participant blinding is central to the WoZ protocol and explicitly described: participants were unaware of the wizard's existence and that a human was controlling the assistant from a separate room.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": true,
    351           "answer": false,
    352           "justification": "All 13 participants completed the study but attrition is not explicitly reported; it is only implicit that no one dropped out.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No API costs or latency figures for ChatGPT usage are reported; the wizard's real-time generation time is described qualitatively but not measured.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": false,
    365           "answer": false,
    366           "justification": "This is primarily a human subjects study; total compute budget is not a meaningful metric.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Planning assistance can help analysts consider analysis alternatives they would not have identified on their own",
    375       "evidence": "12/13 analysts noted planning suggestions presented ideas not previously considered; 51.6% of actionable planning suggestions were incorporated into working analyses (47/91)",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Suggestion timing is the dominant factor in perceived helpfulness — 9/13 analysts felt 'unhelpful' suggestions would have been useful at a different moment",
    380       "evidence": "Interview data from 9 of 13 participants explicitly citing timing as the cause of rejection, not suggestion content",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Analyst statistical and domain expertise significantly moderates how suggestions are perceived, with experts finding basic suggestions unhelpful",
    385       "evidence": "Qualitative data: A1 (statistics professor) found results-interpretation suggestions unhelpful ('It was giving me comments on my results... I knew already'); A3/A5/A13 ignored unfamiliar suggestions",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Current LLM-based assistants (Copilot, ChatGPT) focus on execution assistance and provide no support for analyst-initiated or assistant-initiated planning assistance",
    390       "evidence": "Conceptual analysis in Figure 2 categorizing existing tools; ChatGPT requires active user prompting and lacks proactive planning mode",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "AI analysis assistance can induce over-reliance, causing some analysts to disengage from critical thinking",
    395       "evidence": "A8 described becoming 'this clicking machine (rather) than an analytical thinker' and going on 'autopilot'; general observations of insufficient validation of suggestions",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "There is a fundamental goal misalignment between analysts (speed, completion) and planning assistants (methodical rigor)",
    400       "evidence": "Observed preference for execution assistance across most analysts; analysts' reluctance to divert from current tasks for planning suggestions; qualitative interview themes",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "qualitative",
    406     "observational"
    407   ],
    408   "key_findings": "A Wizard-of-Oz study with 13 high-proficiency data analysts found that planning assistance is broadly valued but its helpfulness depends far more on contextual factors — timing, analyst expertise, and match with current analysis plan — than on suggestion category. Analysts incorporated actionable planning suggestions 51.6% of the time, and 12/13 noted suggestions presented ideas they had not previously considered. A fundamental tension exists between analyst goals (speed and task completion) and planning assistant goals (methodical rigor), and poorly-timed suggestions were frequently ignored even when substantively useful. Over-reliance emerged as a concern, with some analysts disengaging from critical thinking when suggestions were readily available.",
    409   "red_flags": [
    410     {
    411       "flag": "No control condition",
    412       "detail": "The WoZ study lacks a no-assistant baseline, making it impossible to attribute observed behaviors to the assistant rather than to the task, dataset familiarity, or experimenter effects."
    413     },
    414     {
    415       "flag": "Wizard selection bias",
    416       "detail": "The wizard subjectively decided which suggestions to surface, when, and in what order, introducing significant experimenter influence on what participants experienced and potentially curating a more favorable impression of planning assistance."
    417     },
    418     {
    419       "flag": "Unrepresentative sample",
    420       "detail": "N=13, all from UW mailing lists, all with proficiency ≥4/5, all willing to use AI — the paper acknowledges this may positively bias results toward acceptance of AI assistance."
    421     },
    422     {
    423       "flag": "IRB not mentioned",
    424       "detail": "No ethics board approval is mentioned despite recording participants via Zoom and collecting personal behavioral data; only consent for recording is noted."
    425     },
    426     {
    427       "flag": "No pre-registration",
    428       "detail": "Exploratory qualitative study with no pre-registered research questions or analysis plan, increasing risk of post-hoc narrative construction around observed patterns."
    429     },
    430     {
    431       "flag": "ChatGPT version unspecified",
    432       "detail": "ChatGPT was used for generating suggestions but the model version and parameters are not reported, and the wizard manually corrected outputs, making replication impossible."
    433     },
    434     {
    435       "flag": "Single task generalizability",
    436       "detail": "All participants used the same soccer skin-tone/red-cards dataset with the same research question; the degree to which findings generalize to other analysis contexts, domains, or datasets is untested."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "On the Design of AI-powered Code Assistants for Notebooks",
    442       "relevance": "Direct predecessor studying execution-focused AI assistance in computational notebooks via interview study; this paper explicitly extends it to planning assistance"
    443     },
    444     {
    445       "title": "Many Analysts, One Data Set: Making Transparent How Variations in Analytic Choices Affect Results",
    446       "relevance": "The crowd-sourced analysis study (Silberzahn et al. 2018) that provided the dataset and task used in the WoZ study and motivated the suggestion categories"
    447     },
    448     {
    449       "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models",
    450       "relevance": "Prior work on how programmers use code execution assistants; used to contextualize planning assistance as an unexplored complement to execution assistance"
    451     },
    452     {
    453       "title": "Expectation vs. Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models",
    454       "relevance": "Prior usability study of LLM code assistants; findings on expediency and reduced searching are cited as context for analyst preferences"
    455     },
    456     {
    457       "title": "To trust or to think: cognitive forcing functions can reduce overreliance on AI in AI-assisted decision-making",
    458       "relevance": "Key reference for AI over-reliance concern and design mechanisms to counteract it; directly motivates the paper's design guideline on preventing autopilot behavior"
    459     },
    460     {
    461       "title": "Paths Explored, Paths Omitted, Paths Obscured: Decision Points & Selective Reporting in End-to-End Data Analysis",
    462       "relevance": "Shows analysts overlook decision alternatives during analysis; motivates the need for planning assistance to surface these alternatives"
    463     },
    464     {
    465       "title": "Hypothesis Formalization: Empirical Findings, Software Limitations, and Design Implications",
    466       "relevance": "Empirical study of analysis challenges and planning difficulties; used to motivate the study design and suggestion categories"
    467     },
    468     {
    469       "title": "Boba: Authoring and Visualizing Multiverse Analyses",
    470       "relevance": "Multiverse analysis tool that planning assistants could complement; cited as part of the ecosystem where planning assistance enables systematic alternative consideration"
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 2,
    476       "justification": "Directly applicable to designers building AI coding/analysis assistants (Copilot, ChatGPT plugins, Jupyter AI); the 7 design guidelines are actionable for practitioners."
    477     },
    478     "surprise_contrarian": {
    479       "score": 2,
    480       "justification": "The finding that planning assistance is broadly ignored by the field despite analyst need, and that timing matters more than content type, challenges the prevailing focus on code execution assistance."
    481     },
    482     "fear_safety": {
    483       "score": 0,
    484       "justification": "No AI risk or safety concerns raised; over-reliance is discussed as a usability issue, not a safety hazard."
    485     },
    486     "drama_conflict": {
    487       "score": 1,
    488       "justification": "The over-reliance finding ('clicking machine rather than analytical thinker') provides a mild cautionary angle but the paper is not controversial."
    489     },
    490     "demo_ability": {
    491       "score": 1,
    492       "justification": "Code for the JupyterLab interface is released on GitHub but the WoZ setup (requiring a human wizard) is not directly demoed; the interface itself can be explored."
    493     },
    494     "brand_recognition": {
    495       "score": 1,
    496       "justification": "University of Washington; Jeffrey Heer is a prominent HCI/visualization researcher, providing moderate name recognition in the field."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [
    501       {
    502         "hn_id": "38670465",
    503         "title": "An Empirical Study and Evaluation of Modern CAPTCHAs",
    504         "points": 362,
    505         "comments": 329,
    506         "url": "https://news.ycombinator.com/item?id=38670465"
    507       },
    508       {
    509         "hn_id": "34878486",
    510         "title": "On the Mathematics of Diffusion Models",
    511         "points": 17,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=34878486"
    514       },
    515       {
    516         "hn_id": "35715781",
    517         "title": "Fundamental Limitations of Alignment in Large Language Models",
    518         "points": 5,
    519         "comments": 0,
    520         "url": "https://news.ycombinator.com/item?id=35715781"
    521       },
    522       {
    523         "hn_id": "37164389",
    524         "title": "An Empirical Study and Evaluation of Modern CAPTCHAs",
    525         "points": 4,
    526         "comments": 0,
    527         "url": "https://news.ycombinator.com/item?id=37164389"
    528       },
    529       {
    530         "hn_id": "36795746",
    531         "title": "Android in the Wild: A Large-Scale Dataset for Android Device Control",
    532         "points": 3,
    533         "comments": 1,
    534         "url": "https://news.ycombinator.com/item?id=36795746"
    535       },
    536       {
    537         "hn_id": "46203378",
    538         "title": "Are most sentences unique? An empirical examination of Chomskyan claims",
    539         "points": 3,
    540         "comments": 0,
    541         "url": "https://news.ycombinator.com/item?id=46203378"
    542       },
    543       {
    544         "hn_id": "37198551",
    545         "title": "Empirical Study and Evaluation of Modern CAPTCHAs",
    546         "points": 3,
    547         "comments": 0,
    548         "url": "https://news.ycombinator.com/item?id=37198551"
    549       },
    550       {
    551         "hn_id": "35695349",
    552         "title": "Fundamental Limitations of Alignment in Large Language Models",
    553         "points": 3,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=35695349"
    556       },
    557       {
    558         "hn_id": "37668819",
    559         "title": "ElasticNotebook: Enabling Live Migration for Computational Notebooks",
    560         "points": 2,
    561         "comments": 0,
    562         "url": "https://news.ycombinator.com/item?id=37668819"
    563       },
    564       {
    565         "hn_id": "37581838",
    566         "title": "Unsupervised Learning via Network-Aware Embeddings",
    567         "points": 2,
    568         "comments": 0,
    569         "url": "https://news.ycombinator.com/item?id=37581838"
    570       }
    571     ],
    572     "top_points": 362,
    573     "total_points": 404,
    574     "total_comments": 330
    575   }
    576 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs