scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (28906B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Dear Diary: A randomized controlled trial of Generative AI coding tools in the workplace",
      6     "authors": [
      7       "Jenna Butler",
      8       "Jina Suh",
      9       "Sankeerti Haniyur",
     10       "Constance Hadley"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv",
     14     "arxiv_id": "2410.18334",
     15     "doi": null
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims about increased usefulness/enjoyment perception (supported by paired t-tests), unchanged trustworthiness (supported by non-significant changes), unexpected uses (supported by diary coding), and 84%/66% positive change rates (supported by qualitative coding) are all backed by results.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The RCT design supports causal claims, but 25% non-compliance in both groups (control used GenAI, treatment didn't use it) and high attrition (228→106) undermine causal inference. No intention-to-treat analysis is performed; only compliant participants are analyzed, introducing selection bias.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Section 5 (Limitations) explicitly bounds generalization to a single company, acknowledges the population characteristics, and discusses the single-company limitation with reference to Flyvbjerg's case study methodology.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The paper discusses several alternative explanations: the 11-week tipping point for proficiency, confounding factors in telemetry (sprint planning, oncall, vacation), non-compliance, and the hypothesis that AI news cycle (not tool use) drove increased self-confidence in unique skills.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper measures Likert scale beliefs and telemetry proxies (lines of code, PRs) but does not explicitly discuss the gap between these proxies and actual productivity or developer experience. Telemetry is presented as measuring 'impact' without discussing what these proxies actually capture.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 5 is a dedicated 'Limitations' section discussing self-report bias, unvalidated survey instruments, and single-company generalizability.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The limitations section identifies specific threats: self-reported data introducing social desirability and recall bias, unvalidated survey instruments (novel questions with no prior validation), and single-company population.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly states it was conducted at a single company with specific demographics, acknowledges the survey questions were not previously validated, and discusses what the study timeframe may not capture (citing the 11-week tipping point).",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding statement is provided. Three of four authors are Microsoft employees, but there is no explicit funding disclosure.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly listed: three authors at Microsoft (Redmond, WA) and one at Institute for Work Life (Boston, MA).",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Three authors are Microsoft employees evaluating GitHub Copilot, a Microsoft product. Microsoft has a direct financial interest in positive Copilot findings. This is not disclosed as a conflict.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement is present. Microsoft employees evaluating a Microsoft product without any COI declaration.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "'Generative AI coding tools' is operationalized through GitHub Copilot as a concrete example; 'beliefs' are defined via specific Likert statements listed in the methodology; experience levels are defined by explicit survey response categories.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper explicitly claims to be 'one of the first randomized controlled trials of GitHub Copilot in a real-world work environment' examining both quantitative coding data and effects on developer beliefs.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 situates the work within TAM/adoption theory and prior Copilot studies, explicitly contrasting real-world RCT design against prior lab experiments and survey work.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No source code or analysis scripts are released. The paper references supplemental survey materials on Zenodo [5] but no analysis code.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "No raw data (survey responses, diary entries, telemetry) is released. Only supplemental survey questions are shared via Zenodo.",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No environment specifications or software dependencies are provided for reproducing the analysis.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No reproduction instructions are provided. The supplemental material contains only survey instruments, not analysis procedures.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No confidence intervals or error bars are reported. Results are presented as point estimates with p-values only (e.g., 'average rating rose from 2.72 to 3.61').",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "Paired t-tests are used for before/after Likert comparisons (Section 4.2), chi-square for randomization balance, Kruskal-Wallis for diary distributions, and difference-in-differences for telemetry (Section 4.2.1).",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "No standardized effect sizes (Cohen's d, etc.) are reported. Raw mean differences are given (e.g., 2.72 to 3.61) and correlation coefficients (r=0.691 vs 0.606), but no formal effect size measures for the main treatment comparisons.",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No power analysis or sample size justification is provided. The paper acknowledges low power post-hoc in Table 2 (power=0.06 for CodeChanges) but did not plan the sample size in advance.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Standard deviation is reported only for diary submission counts (SD=4.819). No variance measures are reported for the main outcome variables (Likert scales, telemetry metrics).",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "The RCT includes a control group (no Copilot) and a continuing group (already using Copilot), providing baseline comparisons for the treatment group.",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "The control condition (no GenAI tools) is the appropriate contemporary baseline for evaluating the introduction of a new tool.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": false,
    192           "answer": false,
    193           "justification": "Not applicable — this is an RCT evaluating a single tool (GitHub Copilot), not a multi-component system.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Multiple metrics are used: Likert belief scales (usefulness, trust, enjoyment), telemetry (code changes, PRs, development minutes, email minutes, build minutes), and qualitative diary coding.",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "The study centers on human evaluation — surveys, diary entries, and qualitative coding of developer responses about their experience with the tools.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "Not applicable — this is an RCT with human participants, not a benchmark evaluation with train/test splits.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down by prior experience (experienced vs. inexperienced users), by group (treatment/control/continuing), and by demographic categories in Table 1.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 4.3.2 'Challenges' discusses failure cases extensively: incorrect but plausible code, validation overhead negating productivity, lack of language support, with specific diary verbatims.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper reports null telemetry results (Table 2: no statistically significant DiD results for any metric), trust not changing, and 16% negative responses about work changes.",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "The paper refers to 'Github Copilot' throughout without specifying which version, model backend, or snapshot was used. Given Copilot changed significantly over 2022-2023, this is a meaningful omission.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": false,
    242           "answer": false,
    243           "justification": "The paper evaluates GitHub Copilot as a black-box tool used by developers naturally — there are no researcher-designed prompts to report.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": false,
    248           "answer": false,
    249           "justification": "Not applicable — Copilot is used as a black-box developer tool with default settings; there are no researcher-controlled hyperparameters.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "Copilot is evaluated as a third-party black-box tool. The authors cannot describe internal scaffolding.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": false,
    261           "justification": "The paper describes high-level filtering (228→106) but does not document the open coding procedure for qualitative data, inter-rater reliability for coding, or how telemetry was preprocessed for the DiD analysis.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "Raw survey responses, diary entries, and telemetry data are not released. Only supplemental survey instruments are available on Zenodo.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Data collection is described in detail: intake surveys (Section 3.4), daily diary via Teams messages (Section 3.6), closing survey (Section 3.7), and telemetry collection with consent (Section 3.8).",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "Section 3.3 describes recruitment: randomly chosen from 10,000 engineers, 337 completed survey, 269 agreed, 228 after country filtering, 106 final compliant population.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The pipeline is documented: 10,000 randomly chosen → 337 completed survey → 269 consented → 228 after country exclusion → 106 final (compliant + completed diary + completed exit survey). Open coding is mentioned for qualitative analysis.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "This is an RCT studying developer behavior and beliefs, not evaluating a model's capability on a benchmark.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "Not applicable — this study does not evaluate model performance on benchmark tasks.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "Not applicable — no benchmark evaluation is performed.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "No pre-registration is mentioned. No link to OSF, AsPredicted, or any registry.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": true,
    320           "answer": true,
    321           "justification": "The Acknowledgments state: 'The ethics for this study were reviewed and approved by the Microsoft Research Institutional Review Board (MSRIRB), which is an IRB federally registered with the United States Department of Health & Human Services.'",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": true,
    326           "answer": true,
    327           "justification": "Section 3.3 reports gender, management level, seniority (junior/senior/principal), primary programming language, and Table 1 breaks down demographics by randomization group.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": true,
    332           "answer": true,
    333           "justification": "Inclusion criteria: software engineers at the company, in allowed countries. Exclusion: participants not in allowed countries were removed. Final population required completing ≥1 diary, exit survey, and treatment compliance.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": true,
    338           "answer": true,
    339           "justification": "Section 3.5 describes block randomization based on gender, initial perception of AI tools ('I like AI coding tools' and 'I trust AI coding tools'), with chi-square verification of balance.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": true,
    344           "answer": false,
    345           "justification": "No mention of blinding. Participants knew which group they were in (treatment received Copilot access, control was told not to use GenAI). No discussion of whether this knowledge affected responses.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": true,
    350           "answer": true,
    351           "justification": "Attrition is documented: 228 intake → 106 final population (54% attrition). 25% non-compliance in both groups is also reported. However, reasons for dropout beyond non-compliance are not detailed.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "This is an RCT studying developer beliefs, not proposing a method with inference costs.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": false,
    364           "answer": false,
    365           "justification": "This is a human subjects study, not a computational experiment requiring compute budget reporting.",
    366           "source": "opus"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Regular use of GitHub Copilot significantly increased developers' belief that AI coding tools are useful (p=0.001) and enjoyable (p<0.0001).",
    374       "evidence": "Paired t-tests on Likert scores: usefulness 2.93→3.51 (p=0.001), liking 2.72→3.61 (p<0.0001) for treatment group.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "GitHub Copilot use did not significantly change developer trust in AI-generated code.",
    379       "evidence": "No statistically significant change in trust or reliability Likert items; approximately 20% trusted AI code both before and after.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "GitHub Copilot had no statistically significant effect on objective telemetry metrics (code changes, PRs, development minutes).",
    384       "evidence": "Table 2 DiD results: all p-values >0.5 for code changes, PRs, development/email/build minutes; study authors attribute this to underpowering and short duration.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "84% of participants reported positive changes in their daily work practices after Copilot use.",
    389       "evidence": "Open coding of 99 verbatim responses to closing survey: 84% of coded responses described positive changes; based on self-report only.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Developers with prior Copilot experience were significantly more likely to believe the tools are useful (86% vs 44%) and to like them (72% vs 43%).",
    394       "evidence": "Chi-square comparison between experience groups on intake survey; p<0.05 for both usefulness and liking.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "An unexpected use case emerged: developers frequently used Copilot as a replacement for web search.",
    399       "evidence": "Qualitative diary data documents multiple participants explicitly reporting using Copilot instead of Google/Stack Overflow; not anticipated in intake surveys.",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "66% of developers reported changes in how they feel about their work after using Copilot.",
    404       "evidence": "Open coding of closing survey verbatims. The extracted figures are inconsistent: one count has 62% saying no change, which would leave only 38% who reported changed feelings, whereas the paper's reported figure of 66% who noticed any change in feelings comes from a separate count.",
    405       "supported": "weak"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "rct",
    410     "qualitative"
    411   ],
    412   "key_findings": "A 3-week workplace RCT of GitHub Copilot (n=106 completers) found statistically significant increases in developers' beliefs that AI tools are useful and enjoyable, but no significant change in trust in AI-generated code. Telemetry metrics (code changes, PRs, development time) showed no statistically significant differences between treatment and control, likely due to underpowering and duration too short for proficiency to develop. Qualitative diary data revealed unexpected use cases (Copilot as web search replacement, creative ideation aid) alongside persistent challenges (hallucinations, language coverage gaps, validation overhead). The study is notable as one of the first real-world RCTs of GitHub Copilot, but is conducted predominantly by Microsoft employees (three of four authors) evaluating a Microsoft product without pre-registration.",
    413   "red_flags": [
    414     {
    415       "flag": "Severe COI: Microsoft evaluating own product",
    416       "detail": "Three of four authors are Microsoft employees studying GitHub Copilot, a Microsoft product; no competing interests statement is present and positive results directly benefit the researchers' employer."
    417     },
    418     {
    419       "flag": "54% attrition from intake to final population",
    420       "detail": "228 intake participants reduced to 106 completers (54% attrition); compliance issues (25% in each arm violated treatment assignment) further reduce effective sample, and the paper does not analyze whether completers differ systematically from dropouts."
    421     },
    422     {
    423       "flag": "Unvalidated survey instrument",
    424       "detail": "Authors acknowledge survey questions were 'original and not previously validated'; Likert items measuring 'beliefs' have unknown psychometric properties, limiting interpretation of pre/post comparisons."
    425     },
    426     {
    427       "flag": "No pre-registration",
    428       "detail": "An RCT of this type should be pre-registered; the absence of pre-registration allows post-hoc selection among outcomes (the paper reports significance only for belief items while null telemetry results are relegated to one table)."
    429     },
    430     {
    431       "flag": "Paired t-tests without SDs or CIs",
    432       "detail": "Main belief change results report only means and p-values with no standard deviations, confidence intervals, or effect size measures, making it impossible to assess practical significance."
    433     },
    434     {
    435       "flag": "Study underpowered for primary telemetry claims",
    436       "detail": "Post-hoc power for CodeChanges is 0.06 (near chance), meaning the null telemetry result cannot be interpreted as evidence of no effect; the paper buries this in discussion."
    437     },
    438     {
    439       "flag": "Open coding by study authors without inter-rater reliability",
    440       "detail": "Qualitative diary and survey coding was performed by the research team with no reported inter-rater reliability statistics or independent coder verification."
    441     }
    442   ],
    443   "cited_papers": [
    444     {
    445       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    446       "relevance": "Foundational productivity RCT in lab setting (55.8% time reduction) that this study explicitly contrasts against in real-world setting."
    447     },
    448     {
    449       "title": "The Impact of AI Tool on Engineering at ANZ Bank: An Empirical Study on GitHub Copilot within Corporate Environment",
    450       "relevance": "Another real-world Copilot productivity study finding 42.36% boost; used as comparison for belief change findings."
    451     },
    452     {
    453       "title": "Is GitHub Copilot a Substitute for Human Pair-Programming? An Empirical Study",
    454       "relevance": "Prior work on code quality trade-offs of Copilot vs human pair programming."
    455     },
    456     {
    457       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    458       "relevance": "Security vulnerability study (~40% of generated programs) cited as prior evidence for developer concerns about Copilot output quality."
    459     },
    460     {
    461       "title": "Taking Flight with Copilot: Early Insights and Opportunities of AI-Powered Pair-Programming Tools",
    462       "relevance": "Prediction that developer role will shift toward code review rather than writing, which this study tests empirically."
    463     },
    464     {
    465       "title": "Using AI-Based Coding Assistants in Practice: State of Affairs, Perceptions, and Ways Forward",
    466       "relevance": "Survey of developer perceptions and use patterns for GenAI coding assistants, directly related to belief study."
    467     },
    468     {
    469       "title": "Generative AI in Real-World Workplaces",
    470       "relevance": "Microsoft internal report on 11-week tipping point for Copilot productivity effects; cited to explain null 3-week telemetry results."
    471     },
    472     {
    473       "title": "Practices and Challenges of Using GitHub Copilot: An Empirical Study",
    474       "relevance": "Stack Overflow/GitHub discussion analysis of Copilot use patterns, language/IDE distribution, benefits and challenges."
    475     },
    476     {
    477       "title": "Chatting with AI: Deciphering Developer Conversations with ChatGPT",
    478       "relevance": "Analysis of how developers use LLM tools for both traditional and non-traditional software tasks."
    479     },
    480     {
    481       "title": "Transforming Software Development: Evaluating the Efficiency and Challenges of GitHub Copilot in Real-World Projects",
    482       "relevance": "Real-world Copilot study finding 50% time savings in documentation/autocompletion but struggles with complex tasks."
    483     }
    484   ],
    485   "engagement_factors": {
    486     "practical_relevance": {
    487       "score": 1,
    488       "justification": "Findings about Copilot adoption barriers and use cases are interesting but don't give practitioners a new technique or tool to apply."
    489     },
    490     "surprise_contrarian": {
    491       "score": 2,
    492       "justification": "The null telemetry result — no measurable productivity gain despite self-reported enthusiasm — is in tension with the widely-cited '55% faster' claim, although the telemetry analysis was too underpowered to rule out an effect."
    493     },
    494     "fear_safety": {
    495       "score": 0,
    496       "justification": "No safety, security, or risk angle beyond brief mentions of AI-generated code bugs."
    497     },
    498     "drama_conflict": {
    499       "score": 2,
    500       "justification": "Microsoft employees finding no objective productivity gain from their own product, while the company markets it as transformative, creates an uncomfortable tension."
    501     },
    502     "demo_ability": {
    503       "score": 0,
    504       "justification": "This is a workplace study with no code, tool, or demo to try."
    505     },
    506     "brand_recognition": {
    507       "score": 3,
    508       "justification": "Directly about GitHub Copilot (millions of users) conducted at Microsoft, two of the most recognized names in developer tools."
    509     }
    510   },
    511   "hn_data": {
    512     "threads": [
    513       {
    514         "hn_id": "45751115",
    515         "title": "DeepSeek-OCR: Contexts Optical Compression",
    516         "points": 2,
    517         "comments": 0,
    518         "url": "https://news.ycombinator.com/item?id=45751115",
    519         "created_at": "2025-10-29T18:33:29Z"
    520       },
    521       {
    522         "hn_id": "28973605",
    523         "title": "Generalized Out-of-Distribution Detection: A Survey",
    524         "points": 2,
    525         "comments": 0,
    526         "url": "https://news.ycombinator.com/item?id=28973605",
    527         "created_at": "2021-10-24T00:03:38Z"
    528       },
    529       {
    530         "hn_id": "42458574",
    531         "title": "Semantic, Orthographic, and Morphological Biases in Humans' Wordle Gameplay",
    532         "points": 1,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=42458574",
    535         "created_at": "2024-12-19T05:06:04Z"
    536       },
    537       {
    538         "hn_id": "28957390",
    539         "title": "Generalized Out-of-Distribution Detection: A Survey",
    540         "points": 1,
    541         "comments": 0,
    542         "url": "https://news.ycombinator.com/item?id=28957390",
    543         "created_at": "2021-10-22T14:11:31Z"
    544       }
    545     ],
    546     "top_points": 2,
    547     "total_points": 6,
    548     "total_comments": 0
    549   }
    550 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs