scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28350B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "HAI-Eval: Measuring Human-AI Synergy in Collaborative Coding",
      6     "authors": [
      7       "Hanjun Luo",
      8       "Chiming Ni",
      9       "Jiaheng Wen",
     10       "Zhimu Huang",
     11       "Yiran Wang",
     12       "Bingduo Liao",
     13       "Sylvia Chung",
     14       "Yingbin Jin",
     15       "Xinfeng Li",
     16       "Wenyuan Xu",
     17       "XiaoFeng Wang",
     18       "Hanan Salam"
     19     ],
     20     "year": 2025,
     21     "venue": "arXiv.org",
     22     "arxiv_id": "2512.04111",
     23     "doi": "10.48550/arXiv.2512.04111"
     24   },
     25   "checklist": {
     26     "claims_and_evidence": {
     27       "abstract_claims_supported": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Core quantitative claims (0.67% LLM pass rate, 18.89% human-only, 31.11% human-AI) are directly reported in Table 2; benchmark design claims (45 templates, 450 instances, 45 participants, 5 LLMs) are substantiated throughout the paper.",
     31         "source": "haiku"
     32       },
     33       "causal_claims_justified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper claims C2 'statistically significantly improves' over CH but provides no test statistics, p-values, or identification of which statistical test was used despite stating 'Statistical comparisons use appropriate tests with averaged results.'",
     37         "source": "haiku"
     38       },
     39       "generalization_bounded": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Appendix A explicitly bounds scope to Python, Copilot-supported models as of July 26 2025, and East Asian university students, noting 'generalizability may be limited'; the main text repeats this caveat.",
     43         "source": "haiku"
     44       },
     45       "alternative_explanations_discussed": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "LLM failure is attributed exclusively to 'higher-order reasoning' limitations; alternatives such as Copilot interface constraints, prompt-engineering gaps, or task-design artifacts are not considered.",
     49         "source": "haiku"
     50       },
     51       "proxy_outcome_distinction": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Binary pass/fail rates on dynamically generated benchmark tasks are conflated with the broader construct of 'human-AI synergy' and 'developer competencies in the AI era' without discussing what these metrics do and do not capture.",
     55         "source": "haiku"
     56       }
     57     },
     58     "limitations_and_scope": {
     59       "limitations_section_present": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Appendix A is a dedicated 'Limitation & Future Work' section listing four specific limitations (Python-only, Copilot model constraint, demographic homogeneity, binary metric transparency loss).",
     63         "source": "haiku"
     64       },
     65       "threats_to_validity_specific": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Specific threats include Python-only scope, Copilot constraint excluding o3/GPT-5/Deepseek/Llama/Qwen by name, East Asian student demographic limiting generalizability, and loss of interpretability from binary metric conversion.",
     69         "source": "haiku"
     70       },
     71       "scope_boundaries_stated": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Explicit scope boundaries stated: Python only, models available in Copilot Agent mode as of July 26 2025, East Asian CS students or recent graduates aged 19-26.",
     75         "source": "haiku"
     76       }
     77     },
     78     "conflicts_of_interest": {
     79       "funding_disclosed": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No funding acknowledgment or grant disclosure appears anywhere in the paper.",
     83         "source": "haiku"
     84       },
     85       "affiliations_disclosed": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "All 12 author affiliations are listed on the title page (NYU Abu Dhabi, NTU, UIUC, Harvard, Zhejiang University, UESTC, BJUT, HKPolyU).",
     89         "source": "haiku"
     90       },
     91       "funder_independent_of_outcome": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "Funding source is not disclosed, so funder independence cannot be assessed; the benchmark evaluates commercial products (GitHub Copilot, Claude, GPT) but no industry funding or competing interests are declared.",
     95         "source": "haiku"
     96       },
     97       "financial_interests_declared": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No competing interests statement or financial interest declaration is present anywhere in the paper.",
    101         "source": "haiku"
    102       }
    103     },
    104     "scope_and_framing": {
    105       "key_terms_defined": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "'Collaboration-Necessary' is formally defined with mathematical constraints (Equation 1); 'Ecological Validity' is operationalized across three dimensions; 'higher-order reasoning' is grounded in Relational Complexity Theory (Halford et al., 1998).",
    109         "source": "haiku"
    110       },
    111       "intended_contribution_clear": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Three explicit contributions are enumerated: (1) Unified Benchmark, (2) Dual Interfaces for human and LLM evaluation, (3) Empirical Validation quantifying human-AI synergy.",
    115         "source": "haiku"
    116       },
    117       "engagement_with_prior_work": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 2 covers three distinct bodies of work (developer assessment platforms, LLM coding benchmarks, user studies in AI-assisted coding), situating HAI-Eval's contributions against each and identifying gaps.",
    121         "source": "haiku"
    122       }
    123     }
    124   },
    125   "type_checklist": {
    126     "empirical": {
    127       "artifacts": {
    128         "code_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "The abstract uses future tense ('will be openly accessible') and no repository URL is provided; the paper is marked 'Work in progress.'",
    132           "source": "haiku"
    133         },
    134         "data_released": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "Section 4.4 says the 450-instance dataset 'is released as a standalone resource' but no URL is provided, and the abstract's future-tense language and preprint status suggest public availability is not confirmed.",
    138           "source": "haiku"
    139         },
    140         "environment_specified": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "GitHub Codespaces with devcontainer files and Docker are mentioned, but no specific requirements.txt, Dockerfile, or versioned dependency manifest is included in the paper.",
    144           "source": "haiku"
    145         },
    146         "reproduction_instructions": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "The evaluation workflow is described conceptually (Section 4.4) but no step-by-step instructions for rerunning the full benchmark are provided in the paper itself.",
    150           "source": "haiku"
    151         }
    152       },
    153       "statistical_methodology": {
    154         "confidence_intervals_or_error_bars": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "Tables 1, 2, 6, 7, 8 report only point estimates; no confidence intervals or error bars appear for any main performance result.",
    158           "source": "haiku"
    159         },
    160         "significance_tests": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "The paper asserts 'statistically significant improvement' for C2 vs CH but provides no test statistics, p-values, or specification of which test was used.",
    164           "source": "haiku"
    165         },
    166         "effect_sizes_reported": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "Absolute pass rate differences are reported with clear baselines (e.g., 18.89% CH → 31.11% C2, Δ=+12.22pp), constituting effect size reporting.",
    170           "source": "haiku"
    171         },
    172         "sample_size_justified": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "The choice of 45 participants is motivated by Latin Square counterbalancing logistics, not a power analysis; no statistical justification for this sample size is provided.",
    176           "source": "haiku"
    177         },
    178         "variance_reported": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "Standard deviations appear only for subjective Likert metrics (Table 9); all main performance tables (1, 2, 6-8) report no variance measures.",
    182           "source": "haiku"
    183         }
    184       },
    185       "evaluation_design": {
    186         "baselines_included": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "CH (human-only) and C0 (fully autonomous AI) serve as explicit baselines for the C2 (human-AI collaboration) condition.",
    190           "source": "haiku"
    191         },
    192         "baselines_contemporary": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Models include Claude-Sonnet-4, GPT-4.1, GPT-4o, Claude-Sonnet-3.7, and Gemini-2.5-Pro — all SOTA Copilot-supported models as of the evaluation date (July 26, 2025).",
    196           "source": "haiku"
    197         },
    198         "ablation_study": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "C1 (minimally-intervened AI) is an explicit ablation of C2 isolating procedural execution failures from reasoning failures; C1 vs C0 isolates the impact of environmental assistance.",
    202           "source": "haiku"
    203         },
    204         "multiple_metrics": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Results reported using Overall Pass@1, Overall Pass@10, Partial Pass@1, Partial Pass@10, Completion Time, and Token Usage.",
    208           "source": "haiku"
    209         },
    210         "human_evaluation": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "A within-subject user study with 45 expert participants evaluates task performance under human-only (CH) and human-AI collaboration (C2) conditions.",
    214           "source": "haiku"
    215         },
    216         "held_out_test_set": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Final scoring uses 'a comprehensive suite of hidden test cases executed on the backend,' distinct from visible unit tests provided during development.",
    220           "source": "haiku"
    221         },
    222         "per_category_breakdown": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Results are broken down by difficulty level (Easy/Medium/Hard, Tables 2 and 6) and professional track (SDE/MLE/DS, Tables 7 and 8).",
    226           "source": "haiku"
    227         },
    228         "failure_cases_discussed": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Appendix L presents a case study of AI failure modes (distracted by legacy code, unable to extract column-merging logic from underspecified requirements).",
    232           "source": "haiku"
    233         },
    234         "negative_results_reported": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Near-zero LLM autonomous pass rates (0.67% best-case, 0% for GPT-4o) are the central finding, prominently reported across all tables.",
    238           "source": "haiku"
    239         }
    240       },
    241       "setup_transparency": {
    242         "model_versions_specified": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "Models identified by marketing names (Claude-Sonnet-4, GPT-4.1, Gemini-2.5-Pro) with an evaluation date of July 26, 2025 but no specific API model IDs or snapshot version strings.",
    246           "source": "haiku"
    247         },
    248         "prompts_provided": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "Appendix C shows task-generation agent prompts, and Appendix F shows example task READMEs described as 'short summaries of the original text' — the actual verbose task texts sent to LLMs are not provided.",
    252           "source": "haiku"
    253         },
    254         "hyperparameters_reported": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "GPT-4.1 task generation agent: temperature=0.7, top_p=0.9, max_tokens=8192 (Appendix C.1); LLM evaluation uses Copilot defaults, which cannot be customized — explicitly stated.",
    258           "source": "haiku"
    259         },
    260         "scaffolding_described": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "HAI-EC (VS Code extension) scaffolding pipeline is described in detail: environment build, Copilot invocation via VS Code API, README upload, iterative test-feedback loop, and environment cleanup.",
    264           "source": "haiku"
    265         },
    266         "data_preprocessing_documented": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "The template-to-instance generation pipeline (TechnicalParameterTool → ImplementationConstraintTool → contextual tools) and the two-stage quality validation protocol are documented in Sections 4.2 and Appendix D.",
    270           "source": "haiku"
    271         }
    272       },
    273       "data_integrity": {
    274         "raw_data_available": {
    275           "applies": true,
    276           "answer": false,
    277           "justification": "Individual participant scores, per-task results, or raw operational logs are not made available; only aggregated results are reported in the paper.",
    278           "source": "haiku"
    279         },
    280         "data_collection_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Data collection via submission shell script, FastAPI backend endpoint, hidden test case execution, and operational log recording are all described in Section 4.3.",
    284           "source": "haiku"
    285         },
    286         "recruitment_methods_described": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "Participants recruited via 'personal contacts and advertisements posted on university forums'; selection criteria, credential verification process, and track assignment criteria are detailed in Appendix H.1.",
    290           "source": "haiku"
    291         },
    292         "data_pipeline_documented": {
    293           "applies": true,
    294           "answer": true,
    295           "justification": "Pipeline from submission (shell script → HTTPS POST → FastAPI → evaluation scripts → JSON output) is documented in Section 4.3; metric aggregation from raw to derived metrics is described in Section 4.5.",
    296           "source": "haiku"
    297         }
    298       },
    299       "contamination": {
    300         "training_cutoff_stated": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "The paper provides an evaluation date (July 26, 2025) but does not state the training data cutoffs for any of the five evaluated LLMs.",
    304           "source": "haiku"
    305         },
    306         "train_test_overlap_discussed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "Although dynamic instantiation from templates provides some protection, the paper never discusses whether underlying algorithmic problems or template structures could overlap with LLM training data.",
    310           "source": "haiku"
    311         },
    312         "benchmark_contamination_addressed": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "The underlying algorithmic cores (palindromes, sliding windows, graph traversal, palindromic numbers) are standard CS problems well-represented in training corpora; this contamination risk is not addressed.",
    316           "source": "haiku"
    317         }
    318       },
    319       "human_studies": {
    320         "pre_registered": {
    321           "applies": true,
    322           "answer": false,
    323           "justification": "No pre-registration is mentioned anywhere in the paper.",
    324           "source": "haiku"
    325         },
    326         "irb_or_ethics_approval": {
    327           "applies": true,
    328           "answer": true,
    329           "justification": "The informed consent form in Appendix J.1 explicitly states 'this study has been reviewed and approved by the Institutional Review Board (IRB).'",
    330           "source": "haiku"
    331         },
    332         "demographics_reported": {
    333           "applies": true,
    334           "answer": true,
    335           "justification": "Appendix H.2 reports age range (19-26, mean 21.4), gender (28M/17F), education levels (undergrad/master/PhD), internship count (mean 1.47), and daily AI usage rate (84.4%).",
    336           "source": "haiku"
    337         },
    338         "inclusion_exclusion_criteria": {
    339           "applies": true,
    340           "answer": true,
    341           "justification": "Appendix H.1 lists eight specific eligibility criteria including age threshold, CS major, minimum two years programming experience, VS Code proficiency, and AI tool usage frequency.",
    342           "source": "haiku"
    343         },
    344         "randomization_described": {
    345           "applies": true,
    346           "answer": true,
    347           "justification": "Task sequences are randomly selected from all balanced permutations; a Latin Square design ensures every problem appears equally across conditions and is completed by different participants.",
    348           "source": "haiku"
    349         },
    350         "blinding_described": {
    351           "applies": true,
    352           "answer": false,
    353           "justification": "Participants necessarily know which condition they are in (Copilot enabled or not); no outcome assessor blinding or any blinding procedure is mentioned.",
    354           "source": "haiku"
    355         },
    356         "attrition_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No mention of dropouts, withdrawals, or attrition; it is not stated whether all 45 enrolled participants completed all four tasks.",
    360           "source": "haiku"
    361         }
    362       },
    363       "cost_and_practicality": {
    364         "inference_cost_reported": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "Token usage is reported in Tables 2 and 8 in millions (e.g., 2.04-2.31M tokens for C2 across tracks), providing a direct cost proxy for LLM inference.",
    368           "source": "haiku"
    369         },
    370         "compute_budget_stated": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "The total computational budget for running 4,500+ LLM evaluation instances across 5 models and 2 conditions is not stated in terms of cost, GPU-hours, or API expenditure.",
    374           "source": "haiku"
    375         }
    376       }
    377     }
    378   },
    379   "claims": [
    380     {
    381       "claim": "Current SOTA LLMs achieve near-zero pass rates (best 0.67% overall pass@1 for Claude-Sonnet-4 in C0) on collaboration-necessary coding tasks.",
    382       "evidence": "Table 1 shows all five models achieve <1% overall pass@1 in autonomous condition C0, with GPT-4o at 0% across all pass metrics.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Human-AI collaboration (31.11% pass rate) significantly outperforms both standalone LLMs (~0.67%) and unaided humans (18.89%).",
    387       "evidence": "Table 2 averaged overall pass@1: CH=18.89%, C0=0.67%, C2=31.11%; paper claims statistical significance but provides no test statistics.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Human-AI collaboration is especially beneficial on harder tasks, with collaborative performance remaining stable while unaided human performance degrades sharply.",
    392       "evidence": "Table 2 shows C2 range 23.33-43.33% vs CH range 6.67-36.67% across difficulty levels, but no significance tests are reported for the interaction.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "LLMs have evolved from execution tools to co-reasoning partners; 51% of all participants and 80% of top performers reported AI suggested a fundamentally different algorithmic approach.",
    397       "evidence": "Table 10 reports self-reported usage patterns post-hoc; 23/45 total, 12/15 top performers selected 'Suggest a fundamentally different approach.'",
    398       "supported": "weak"
    399     },
    400     {
    401       "claim": "HAI-Eval templates are robustly collaboration-necessary: 42 of 45 templates achieved 0% pass rate across all three validation models.",
    402       "evidence": "Table 4 shows overall validation pass rates of 0.33%, 0%, and 0.22% for Claude-Sonnet-4, GPT-4.1, and Gemini-2.5-Pro respectively.",
    403       "supported": "strong"
    404     }
    405   ],
    406   "methodology_tags": [
    407     "benchmark-eval",
    408     "observational",
    409     "qualitative"
    410   ],
    411   "key_findings": "HAI-Eval introduces 45 'collaboration-necessary' problem templates on which five SOTA LLMs achieve near-zero autonomous pass rates (best 0.67%) and unaided human experts achieve 18.89%, whereas human-AI collaboration reaches 31.11% — demonstrating a measurable synergy gap. Uniform LLM failure across all difficulty levels and professional tracks (SDE/MLE/DS) is interpreted by the authors as a fundamental higher-order reasoning limitation, distinct from algorithmic or domain-specific deficiency. Self-report data from 45 participants indicates that 80% of top performers leveraged AI for strategic brainstorming and algorithmic approach selection, supporting an 'emergent co-reasoning partnership' framing, though this rests on post-hoc self-report rather than objective behavioral measurement.",
    412   "red_flags": [
    413     {
    414       "flag": "Statistical significance claimed without test statistics",
    415       "detail": "The paper asserts 'statistically significant improvement' for C2 vs CH but reports no p-values, test statistics, degrees of freedom, or identification of which statistical test was used."
    416     },
    417     {
    418       "flag": "No confidence intervals or error bars",
    419       "detail": "All main performance tables (1, 2, 6-8) report only point estimates; variance across participants, runs, or task instances is not reported for any performance metric."
    420     },
    421     {
    422       "flag": "Co-reasoning conclusion via self-report only",
    423       "detail": "The 'co-reasoning partnership' central claim relies on post-hoc questionnaire responses rather than objective behavioral metrics from operational logs, which the paper collected but does not analyze quantitatively."
    424     },
    425     {
    426       "flag": "Artifacts not yet released",
    427       "detail": "Code and dataset are described in the future tense ('will be openly accessible') with no repository URL provided; the paper is marked 'Work in progress,' making reproducibility unverifiable."
    428     },
    429     {
    430       "flag": "Demographic homogeneity limits generalizability",
    431       "detail": "All 45 participants are East Asian CS students or recent graduates from elite institutions; no industry professionals, no non-East-Asian participants, severely limiting claims about developer populations broadly."
    432     },
    433     {
    434       "flag": "Contamination not addressed",
    435       "detail": "Underlying algorithmic cores (palindromes, sliding window, DFS with backtracking) are standard CS problems widely present in LLM training data; neither training cutoffs nor benchmark contamination risk is discussed."
    436     },
    437     {
    438       "flag": "Post-hoc top-performer subgroup analysis",
    439       "detail": "Comparison of top-15 performers vs all-45 (Table 10) is a post-hoc subgroup partition with no pre-specification or correction for multiple comparisons, making the 80% vs 51% contrast unreliable."
    440     }
    441   ],
    442   "cited_papers": [
    443     {
    444       "title": "SWE-bench: Can Language Models Solve Real-World Software Engineering Problems?",
    445       "relevance": "Major real-world coding benchmark that HAI-Eval contrasts itself with, as an example of well-defined task framing that excludes higher-order reasoning"
    446     },
    447     {
    448       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    449       "relevance": "Foundational function-level code generation benchmark used as a baseline example of what HAI-Eval moves beyond"
    450     },
    451     {
    452       "title": "LiveCodeBench: A Comprehensive Benchmark for General-Purpose Language Agents",
    453       "relevance": "Cited as focusing on extreme algorithmic complexity without measuring higher-order skills like requirement engineering"
    454     },
    455     {
    456       "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models",
    457       "relevance": "Academic user study of Copilot interaction patterns directly related to HAI-Eval's human study design"
    458     },
    459     {
    460       "title": "Reading Between the Lines: Modeling User Behavior and Costs in AI-Assisted Programming",
    461       "relevance": "Closest prior work on modeling human cognitive costs in AI-assisted programming, motivating HAI-Eval's 'necessary collaboration' design principle"
    462     },
    463     {
    464       "title": "Collaborative Gym: A Framework for Enabling and Evaluating Human-Agent Collaboration",
    465       "relevance": "Most closely related framework for human-agent collaboration evaluation; HAI-Eval applies this paradigm specifically to coding"
    466     },
    467     {
    468       "title": "When Combinations of Humans and AI Are Useful: A Systematic Review and Meta-Analysis",
    469       "relevance": "Meta-analysis providing theoretical grounding for HAI-Eval's complementarity hypothesis"
    470     },
    471     {
    472       "title": "How Much Does AI Impact Development Speed? An Enterprise-Based Randomized Controlled Trial",
    473       "relevance": "RCT on AI coding productivity, representing the enterprise evidence base that HAI-Eval situates itself against as lacking standardized benchmark reproducibility"
    474     },
    475     {
    476       "title": "Sea Change in Software Development: Economic and Productivity Analysis of the AI-Powered Developer Lifecycle",
    477       "relevance": "Enterprise AI productivity study reporting 15-30% gains; represents the class of productivity-focused studies HAI-Eval aims to methodologically complement"
    478     }
    479   ],
    480   "engagement_factors": {
    481     "practical_relevance": {
    482       "score": 3,
    483       "justification": "Directly addresses how to evaluate developer competence and AI tool effectiveness in the AI era, with clear implications for hiring, education, and tool benchmarking."
    484     },
    485     "surprise_contrarian": {
    486       "score": 2,
    487       "justification": "Near-zero SOTA LLM performance on tasks solvable with human collaboration is striking; the co-reasoning framing challenges the simple tool-use model of AI-assisted coding."
    488     },
    489     "fear_safety": {
    490       "score": 0,
    491       "justification": "No AI safety or risk concerns raised; focus is purely on capability evaluation and benchmark design."
    492     },
    493     "drama_conflict": {
    494       "score": 1,
    495       "justification": "Implicitly challenges the LeetCode/HackerRank hiring paradigm and the 'AI replaces developers' narrative, but no explicit controversy or adversarial framing."
    496     },
    497     "demo_ability": {
    498       "score": 2,
    499       "justification": "An interactive demo is promised and the VS Code/Copilot setup is familiar to practitioners; however, availability is only promised (future tense) as of paper submission time."
    500     },
    501     "brand_recognition": {
    502       "score": 1,
    503       "justification": "Multi-institution academic paper from NYU Abu Dhabi, UIUC, Harvard, NTU — notable universities but not a major AI lab; no industry co-authors."
    504     }
    505   },
    506   "hn_data": {
    507     "threads": [
    508       {
    509         "hn_id": "25314295",
    510         "title": "Neural Teleportation",
    511         "points": 3,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=25314295",
    514         "created_at": "2020-12-05T13:14:38Z"
    515       },
    516       {
    517         "hn_id": "41124882",
    518         "title": "Luck, skill, and depth of competition in games and social hierarchies",
    519         "points": 2,
    520         "comments": 0,
    521         "url": "https://news.ycombinator.com/item?id=41124882",
    522         "created_at": "2024-08-01T00:08:49Z"
    523       },
    524       {
    525         "hn_id": "46208288",
    526         "title": "Autodeleveraging: Impossibilities and Optimization",
    527         "points": 1,
    528         "comments": 0,
    529         "url": "https://news.ycombinator.com/item?id=46208288",
    530         "created_at": "2025-12-09T18:07:46Z"
    531       },
    532       {
    533         "hn_id": "42348424",
    534         "title": "Enhancing Mathematical Reasoning in LLMs with Background Operators",
    535         "points": 1,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=42348424",
    538         "created_at": "2024-12-07T09:23:20Z"
    539       }
    540     ],
    541     "top_points": 3,
    542     "total_points": 7,
    543     "total_comments": 0
    544   }
    545 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs