scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29229B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Dear Diary: A randomized controlled trial of Generative AI coding tools in the workplace",
      6     "authors": [
      7       "Jenna Butler",
      8       "Jina Suh",
      9       "Sankeerti Haniyur",
     10       "Constance Hadley"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv",
     14     "arxiv_id": "2410.18334",
     15     "doi": "10.1145/nnnnnnn.nnnnnnn"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All major abstract claims are supported in the results: significantly increased usefulness (p=0.001) and enjoyment (p<0.0001), unchanged trust, 84% positive work changes, and 66% feeling changes are documented through surveys and diary coding.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "Although an RCT design was used, the primary causal claim that Copilot 'significantly increased' positive beliefs is based on within-group paired t-tests for the treatment group only; the paper does not present treatment-vs-control belief comparisons, so the control group's belief trajectory during the same period is not shown.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The conclusion states 'generative AI tools are changing work, mostly for the better' and makes broad organizational recommendations without bounding claims to the single-company, 3-week, primarily-male developer population studied.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The paper explicitly considers that the contemporaneous surge in AI news coverage — not Copilot use — may explain why both treatment and control groups increased in believing they have unique technical skills; multiple explanations for null telemetry results are also discussed.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper carefully distinguishes self-reported productivity (surveys and diaries) from objective productivity (telemetry), explicitly noting that telemetry showed no statistically significant differences even while beliefs improved.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 5 is a dedicated 'LIMITATIONS' section covering self-report biases, unvalidated survey instruments, and single-company generalizability.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats are named: 25% control group contamination (used GenAI tools anyway), 25% treatment non-compliance, study may be too short (citing research suggesting 11 weeks needed for tipping point), post-hoc power as low as 0.06 for code changes, and coding confounders (meetings, oncall, vacation).",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The limitations note single-company scope but defend it via Flyvbjerg's case-study argument; there are no explicit statements of what the results do NOT show, and the conclusion makes broad recommendations for organizations generally.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding statement is provided anywhere in the paper; three of four authors are Microsoft employees conducting research on a Microsoft product, constituting implicit institutional funding that is never formally disclosed.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors list institutional affiliations clearly in the header: Butler, Suh, and Haniyur at Microsoft; Hadley at Institute for Work Life.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Three of four authors are Microsoft employees evaluating GitHub Copilot, a Microsoft/GitHub product; the institution has direct financial interest in favorable findings and is not independent of the outcome.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or financial interests declaration appears in the paper; the acknowledgments mention Microsoft colleagues but do not address potential conflicts.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are operationally defined: 'usefulness,' 'enjoyment,' and 'trust' are measured via specific Likert statements; 'Copilot experience' is defined by explicit usage frequency categories; productivity is measured via both self-report and multiple telemetry metrics.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper explicitly states its contribution as 'one of the first randomized controlled trials of GitHub Copilot in a real-world work environment' examining effects on both quantitative coding data and developers' beliefs and values.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 systematically engages with TAM-related adoption literature and prior Copilot studies (Peng et al., Chatterjee et al., Imai, Zhang et al.), explicitly positioning this work as extending from controlled lab settings to the real workplace.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No analysis code is released; supplemental material [5] on Zenodo contains only the survey instrument, not data processing or statistical analysis scripts.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "Raw survey responses, diary entries, and telemetry data are not publicly available; only the survey instrument is accessible via Zenodo.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": false,
    134           "answer": false,
    135           "justification": "This is a human subjects study; GitHub Copilot is a commercial tool with no researcher-controlled software environment to specify.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step instructions for replicating the study design, recruitment, or analysis pipeline are provided.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "The paper reports means and p-values throughout but no confidence intervals or error bars for any main results, including Likert scale changes and DiD estimates in Table 2.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "The paper uses paired t-tests for belief changes, chi-square for group equivalence checks, Kruskal-Wallis for diary distribution comparison, and difference-in-difference for telemetry outcomes.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "Mean differences on Likert scales are reported (e.g., 2.72→3.61) but no standardized effect sizes (Cohen's d, eta-squared) are calculated; Table 2 DiD coefficients are on raw scales without standardization.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No a priori power analysis or sample size justification is provided; post-hoc power in Table 2 reveals critical underpowering (as low as 0.06 for code changes), which is acknowledged retrospectively but not addressed prospectively.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Standard deviation is reported only for diary submission counts (mean 8.37, SD 4.819); Likert scale comparisons report means without variance, and DiD coefficients lack standard errors in the table.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "The concurrent control group (developers not using Copilot) and 'Continuing' group (prior users) serve as baselines for both telemetry and belief comparisons.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "The control group is concurrent (same time period, summer 2023), making it a contemporary baseline with no temporal confounding.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": false,
    192           "answer": false,
    193           "justification": "This is a tool adoption study evaluating GitHub Copilot as a whole product; component ablation is not applicable.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "The study combines multiple Likert-scale belief measures, qualitative diary entries with open coding, and six objective telemetry metrics (code changes, PRs, development time, PR hours, email time, build time).",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "The entire study is a human evaluation: 106 software engineers assessed Copilot through daily diary reflections and pre/post surveys on perceived usefulness, enjoyment, trust, and work impact.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "This is an RCT/diary study of developer behavior, not a prediction task; held-out test sets are not applicable.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down by prior experience (experienced vs. inexperienced), by group (treatment/control/continuing), by engineering level (junior/senior/principal), and by use case category in diary coding.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 4.3.2 'Challenges' explicitly discusses failure modes: hallucinations, syntactically correct but semantically wrong code, poor support for niche languages/file types (Android bp/mk, YAML, JSON config), and cases where validation overhead negated productivity gains.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The telemetry analysis found no statistically significant differences between treatment and control on any of six metrics (p-values 0.5–0.9); this null result is reported clearly in Table 2 and discussed at length.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "GitHub Copilot is named but no model version, API version, or snapshot date is specified; the study was conducted around summer 2023 but Copilot's underlying model changed during this period.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "The full survey instrument is available via a Zenodo DOI [5], and the daily diary question structure is described in Section 3.6; study instruments are sufficiently accessible.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": false,
    248           "answer": false,
    249           "justification": "GitHub Copilot is a black-box commercial tool with no researcher-controlled hyperparameters; not applicable.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is involved; the study evaluates organic adoption and use of an existing commercial IDE plugin.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "The paper describes 6-week baseline telemetry collection, consent procedures, country-based exclusions, parallel-trends validation for DiD, and open coding procedures for qualitative diary and survey data.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "Raw survey responses, diary entries, and telemetry data are not publicly released; only the survey instrument is on Zenodo.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Data collection is described in detail: daily Teams message diaries, intake and exit surveys with example questions, telemetry from the corporate engineering system requiring written consent, and a 6-week baseline collection period.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "Recruitment is described: starting from 10,000 randomly selected engineers, 337 completed intake survey, 269 agreed to participate, 228 remained after country exclusions, 106 reached the compliant final population.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The full pipeline is documented: recruitment → intake survey → block randomization → daily diary → exit survey → telemetry DiD analysis, with analysis methods (paired t-test, open coding, DiD, chi-square) explained for each data type.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "This study evaluates developer adoption and beliefs, not model capabilities on benchmarks; training cutoff is not relevant.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "Not evaluating model capabilities on benchmarks; not applicable.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "Not evaluating model capabilities on benchmarks; not applicable.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "No pre-registration is mentioned anywhere in the paper; this is a notable omission for an RCT, as it allows post-hoc emphasis on outcomes that showed significant results (beliefs) over those that did not (telemetry).",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": true,
    320           "answer": true,
    321           "justification": "The acknowledgments state: 'The ethics for this study were reviewed and approved by the Microsoft Research Institutional Review Board (MSRIRB), which is an IRB federally registered with the United States Department of Health & Human Services.'",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": true,
    326           "answer": true,
    327           "justification": "Demographics are reported for both intake (n=228) and final (n=106) populations including gender, management level, engineering seniority, and primary programming language, broken down by group (treatment/control/continuing) in Table 1.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": true,
    332           "answer": true,
    333           "justification": "Criteria are described: engineers from the organization, restricted to allowed countries, required to complete at least 1 diary, and required self-attested compliance with group assignment verified in exit survey.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": true,
    338           "answer": true,
    339           "justification": "Block randomization is described: developers without prior Copilot experience were randomly assigned to treatment/control, stratified on gender and two prior-belief items ('I like AI coding tools,' 'I trust AI coding tools'); chi-square confirmed group balance.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "Blinding is not feasible in a tool adoption study where participants must actively choose to use or not use GitHub Copilot; the study is inherently open-label.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": true,
    350           "answer": true,
    351           "justification": "Attrition is explicitly reported: 228 intake → 106 final (46% retention), with specific causes: 25% of treatment never used Copilot, 25% of control used GenAI tools during study, others failed to complete diaries or exit survey.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "This is a human subjects study evaluating commercial tool adoption; inference cost is not applicable.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": false,
    364           "answer": false,
    365           "justification": "No computational budget is relevant to this human subjects research study.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Three weeks of GitHub Copilot use significantly increased developers' belief that AI tools are useful (mean 2.93→3.51, p=0.001) and enjoyable (mean 2.72→3.61, p<0.0001)",
    374       "evidence": "Paired t-tests on treatment group Likert responses before and after 3-week study period",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Developers' trust in AI-generated code did not change significantly after using Copilot",
    379       "evidence": "No statistically significant change in trust-related Likert items; ~20% trust AI code before and after",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "GitHub Copilot access produced no statistically significant change in objective telemetry metrics including code changes, PRs, and development time",
    384       "evidence": "DiD analysis across 6 metrics; p-values range 0.5–0.9; post-hoc power as low as 0.06; Table 2",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "84% of participants reported positive changes in how they work after using Copilot",
    389       "evidence": "Open coding of 94 exit survey verbatim responses; 84% of 129 total codes were positive",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Developers with prior Copilot experience were significantly more likely to believe tools are useful (86% vs 44%) and enjoyable (72% vs 43%)",
    394       "evidence": "Chi-square comparison of experienced vs. inexperienced users on intake survey; p<0.05 for both measures",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Developers discovered unexpected uses for Copilot including web search replacement and creative ideation",
    399       "evidence": "Qualitative diary coding identified web search replacement as a common unanticipated use case, with multiple supporting verbatim quotes",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "rct",
    405     "qualitative",
    406     "observational"
    407   ],
    408   "key_findings": "This workplace RCT (n=106 compliant, 3-week treatment) found that GitHub Copilot use significantly increased developers' positive beliefs about AI tool usefulness and enjoyment, but did not change trust in AI-generated code. Objective telemetry metrics showed no statistically significant productivity effect in any of six measures, likely due to study duration (3 weeks may be too short), severe underpowering (post-hoc power as low as 0.06), and 25% contamination in both arms. Qualitative diary data revealed diverse unexpected use cases — particularly web search replacement — and 84% of treatment participants reported positive changes to how they work, though participant self-report bias and the undisclosed conflict of interest of Microsoft-affiliated researchers evaluating a Microsoft product are notable caveats.",
    409   "red_flags": [
    410     {
    411       "flag": "Microsoft authors evaluating Microsoft product, no COI disclosure",
    412       "detail": "Three of four authors are Microsoft employees evaluating GitHub Copilot (a Microsoft/GitHub product); no conflict of interest or funding is disclosed despite direct institutional financial stake."
    413     },
    414     {
    415       "flag": "Causal belief claims from within-group analysis, not RCT comparison",
    416       "detail": "The headline causal claim that Copilot increased usefulness and enjoyment beliefs is based on within-treatment paired t-tests; the paper never presents treatment-vs-control belief comparisons, so the control group's belief trajectory during the same period is unknown."
    417     },
    418     {
    419       "flag": "Severe bilateral contamination",
    420       "detail": "25% of the control group self-reported using GenAI tools during the study, and 25% of the treatment group reported not using Copilot; this non-compliance by roughly a quarter of each arm severely undermines experimental validity."
    421     },
    422     {
    423       "flag": "Critically underpowered telemetry analysis",
    424       "detail": "Table 2 shows post-hoc statistical power as low as 0.06 for code changes; the null telemetry result likely reflects inadequate power rather than a true absence of effect."
    425     },
    426     {
    427       "flag": "No pre-registration for RCT",
    428       "detail": "An RCT with no pre-registration allows post-hoc emphasis on outcomes that showed significant results (beliefs) over those that did not (telemetry), creating potential for selective reporting."
    429     },
    430     {
    431       "flag": "No confidence intervals on any main results",
    432       "detail": "All results report only means and p-values; the absence of CIs prevents readers from assessing the precision of the estimates or whether the effects are practically meaningful."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    438       "relevance": "Key prior RCT of Copilot in lab setting showing 55.8% time reduction on HTTP server task; this paper extends to real workplace over 3 weeks"
    439     },
    440     {
    441       "title": "The Impact of AI Tool on Engineering at ANZ Bank: An Empirical Study on GitHub Copilot within Corporate Environment",
    442       "relevance": "Corporate Copilot study showing 42.36% productivity boost via controlled programming tasks; prior workplace evidence this study contextualizes"
    443     },
    444     {
    445       "title": "Is GitHub Copilot a Substitute for Human Pair-Programming? An Empirical Study",
    446       "relevance": "Finds Copilot increases lines of code but at lower quality than human pair programming; directly relevant to code quality concerns raised in diary study"
    447     },
    448     {
    449       "title": "GitHub Copilot AI pair programmer: Asset or Liability?",
    450       "relevance": "Compares Copilot to humans on fundamental coding problems; finds humans succeed more but Copilot-introduced bugs are easier to fix"
    451     },
    452     {
    453       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    454       "relevance": "Found ~40% of generated programs contain security vulnerabilities; cited to validate developer concerns about code correctness seen in diaries"
    455     },
    456     {
    457       "title": "Practices and Challenges of Using GitHub Copilot: An Empirical Study",
    458       "relevance": "Analyzed Stack Overflow and GitHub Discussions on Copilot usage patterns; provides context for real-world usage challenges and language support findings"
    459     },
    460     {
    461       "title": "Using AI-Based Coding Assistants in Practice: State of Affairs, Perceptions, and Ways Forward",
    462       "relevance": "Survey of developer attitudes toward AI coding assistants including preference for test and documentation writing; directly relevant to belief formation and use case findings"
    463     },
    464     {
    465       "title": "Early Results from a Study of GenAI Adoption in a Large Brazilian Company: The Case of Globo",
    466       "relevance": "Another corporate GenAI adoption study; comparative workplace evidence for adoption patterns"
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 3,
    472       "justification": "Organizations adopting AI coding tools can directly apply findings on adoption barriers, the belief-vs-telemetry disconnect, and the 11-week tipping point context for realistic expectation-setting."
    473     },
    474     "surprise_contrarian": {
    475       "score": 2,
    476       "justification": "The finding that enjoyment increases but trust does not, and that objective telemetry shows no significant effect while subjective satisfaction improves, challenges the dominant narrative that Copilot straightforwardly improves developer output."
    477     },
    478     "fear_safety": {
    479       "score": 1,
    480       "justification": "The paper documents developer fears about AI replacement and code quality risks (hallucinations, subtle bugs), but these are secondary findings within a primarily adoption-focused study."
    481     },
    482     "drama_conflict": {
    483       "score": 2,
    484       "justification": "Microsoft employees studying a Microsoft product with a null productivity result creates inherent credibility tension; the 'Dear Diary' framing and verbatim quotes also give the study an unusually personal voice."
    485     },
    486     "demo_ability": {
    487       "score": 2,
    488       "justification": "GitHub Copilot is widely available, making study findings immediately testable and actionable for any developer."
    489     },
    490     "brand_recognition": {
    491       "score": 3,
    492       "justification": "GitHub Copilot is the most recognizable AI coding tool, and Microsoft Research involvement adds institutional weight; the paper's RCT claim for a flagship product drives interest."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [
    497       {
    498         "hn_id": "45751115",
    499         "title": "DeepSeek-OCR: Contexts Optical Compression",
    500         "points": 2,
    501         "comments": 0,
    502         "url": "https://news.ycombinator.com/item?id=45751115",
    503         "created_at": "2025-10-29T18:33:29Z"
    504       },
    505       {
    506         "hn_id": "28973605",
    507         "title": "Generalized Out-of-Distribution Detection: A Survey",
    508         "points": 2,
    509         "comments": 0,
    510         "url": "https://news.ycombinator.com/item?id=28973605",
    511         "created_at": "2021-10-24T00:03:38Z"
    512       },
    513       {
    514         "hn_id": "42458574",
    515         "title": "Semantic, Orthographic, and Morphological Biases in Humans' Wordle Gameplay",
    516         "points": 1,
    517         "comments": 0,
    518         "url": "https://news.ycombinator.com/item?id=42458574",
    519         "created_at": "2024-12-19T05:06:04Z"
    520       },
    521       {
    522         "hn_id": "28957390",
    523         "title": "Generalized Out-of-Distribution Detection: A Survey",
    524         "points": 1,
    525         "comments": 0,
    526         "url": "https://news.ycombinator.com/item?id=28957390",
    527         "created_at": "2021-10-22T14:11:31Z"
    528       }
    529     ],
    530     "top_points": 2,
    531     "total_points": 6,
    532     "total_comments": 0
    533   }
    534 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs