scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22442B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "HumanEvalComm: Benchmarking the Communication Competence of Code Generation for LLMs and LLM Agents",
      6     "authors": [
      7       "J. Wu",
      8       "Fatemeh H. Fard"
      9     ],
     10     "year": 2024,
     11     "venue": "ACM Transactions on Software Engineering and Methodology",
     12     "arxiv_id": "2406.00215",
     13     "doi": "10.1145/3715109"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "All quantitative claims in the abstract (>60% generate code, 35-52% Pass@1 drop, absolute 58%/38% improvement for Okanagan) are directly traceable to Tables 3-5 with statistical significance reported.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper claims Okanagan 'boosts' pass rates by asking clarifying questions, but the False Recovery Rate problem (30-40% for some models) means the evaluator sometimes provides missing info even without questions being asked, confounding the causal mechanism.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The benchmark is built exclusively on HumanEval (164 Python problems, algorithmic tasks) yet the title and framing claim general communication competence evaluation; the external validity section acknowledges non-generalizability only briefly.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The paper discusses alternative explanations for anomalous results (DeepSeek Chat improvement attributed to formatting failures in baseline, LLM evaluator inflation affecting Good Question Rate) and acknowledges the generative nature of LLMs as explaining low communication rates.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper distinguishes between Communication Rate (proxy for asking behavior), Good Question Rate (proxy for question quality), and Pass@1/Test Pass Rate (functional correctness), explicitly discussing how each measures a different aspect and where they diverge from ground truth.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 5.5 'Limitations' and Section 7 'Threats to Validity' both exist as dedicated sections addressing separate concerns about the benchmark and evaluation.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Specific threats are named: manual modification subjectivity, parameter choices for open-source models (max_new_tokens=512), LLM evaluator tendency to over-label 'Good' questions, and False Recovery Rate inflating pass rates for specific models (CodeQwen1.5, DeepSeek Coder).",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper explicitly states results cannot be claimed for other datasets without further testing, and the benchmark is restricted to Python/HumanEval-style algorithmic problems with three specific clarification types.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Funding is disclosed in acknowledgments: 'Natural Sciences and Engineering Research Council of Canada RGPIN-2019-05175.'",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors list University of British Columbia, Kelowna affiliation; no evaluated product is affiliated with the authors.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "NSERC is a Canadian government research council with no stake in the benchmark results or the models evaluated.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests or financial interests declaration is present; the acknowledgment only mentions the grant and student helpers.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Communication capability is formally defined as how well a model asks clarifying questions when requirements are incomplete/inconsistent/ambiguous; Communication Rate and Good Question Rate are precisely defined with formulas; the three clarification types have explicit definitions drawn from Requirements Engineering literature.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Three explicit contributions are enumerated: (1) HumanEvalComm benchmark, (2) Okanagan LLM agent approach, (3) first empirical study on communication competence with LLM-based evaluator and new metrics.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 8 covers code generation LLMs, self-correction, and LLM agents for code generation, situating the work as the 'first systematic empirical study' on communication skills and distinguishing it from prior task-solving agent work.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "benchmark-creation": {
    117       "construct_design": {
    118         "construct_validity_argued": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "The paper argues that communication competence in software engineering requires asking clarifying questions under ambiguous/incomplete/inconsistent requirements, grounded in Requirements Engineering literature, and the benchmark is designed to operationalize this by creating problems that require clarification to solve correctly.",
    122           "source": "haiku"
    123         },
    124         "difficulty_distribution_characterized": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "Difficulty is implicitly ordered (2-type combinations harder than 1-type, Incompleteness harder than Inconsistency for pass rates) but no formal difficulty characterization, tiers, or difficulty metrics are reported; easy/medium/hard is not quantified.",
    128           "source": "haiku"
    129         },
    130         "ceiling_floor_effects_checked": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "CodeQwen1.5 Chat achieves 0% Communication Rate for 1p (floor effect in communication) and some models approach very low pass rates; the paper notes results but does not explicitly check for or discuss ceiling/floor effects as a benchmark design concern.",
    134           "source": "haiku"
    135         },
    136         "human_baseline_included": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No human baseline for the main communication task is included; the 6 graduate students are used only to validate the LLM-based evaluator, not as a performance baseline for asking clarifying questions.",
    140           "source": "haiku"
    141         },
    142         "scoring_rubric_justified": {
    143           "applies": true,
    144           "answer": true,
    145           "justification": "Communication Rate (code vs. non-code binary) and Good Question Rate (Good/Fair/Bad via LLM evaluator) are defined and their limitations discussed in detail in RQ3, including comparison with human judgment to validate the rubric.",
    146           "source": "haiku"
    147         }
    148       },
    149       "robustness": {
    150         "contamination_resistance_designed": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No contamination resistance is designed in; the benchmark is derived from HumanEval which is widely used in training, and the paper does not analyze whether models have seen original HumanEval problems during pre-training.",
    154           "source": "haiku"
    155         },
    156         "temporal_robustness_discussed": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "Temporal robustness is not discussed; the paper does not address how the benchmark will remain useful as models improve or become capable of consistently asking clarifying questions.",
    160           "source": "haiku"
    161         },
    162         "failure_modes_discussed": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Section 5.3 and 5.5 discuss key failure modes: LLM evaluator over-labeling Good questions, False Recovery Rate inflating pass rates for non-communicating models, and Okanagan asking unnecessary questions for clear problems.",
    166           "source": "haiku"
    167         },
    168         "baseline_implementations_provided": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "Full replication package including benchmark data, evaluation code, and Okanagan implementation is publicly released at https://github.com/jie-jw-wu/human-eval-comm.",
    172           "source": "haiku"
    173         }
    174       },
    175       "documentation": {
    176         "dataset_documentation_complete": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "The construction methodology is described (Section 2, ~100 hours manual modification, 2 annotators), statistics are given (Table 1), but no formal data card, preprocessing steps, or storage format documentation is provided.",
    180           "source": "haiku"
    181         },
    182         "licensing_and_access_clear": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "The benchmark is stated to be publicly available on GitHub but no specific license (e.g., MIT, Apache, CC-BY) is mentioned in the paper, creating ambiguity about permitted use.",
    186           "source": "haiku"
    187         },
    188         "intended_use_specified": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Section 5.5 notes other researchers can use HumanEvalComm directly without replication effort; Section 6 specifies it should evaluate communication competence in code generation; limitations on generalizability to other datasets are stated.",
    192           "source": "haiku"
    193         }
    194       }
    195     }
    196   },
    197   "claims": [
    198     {
    199       "claim": "More than 60% of Code LLM responses generate code rather than ask clarifying questions when problem descriptions have clarification issues.",
    200       "evidence": "Table 3 shows Communication Rates of 14.21% (ChatGPT), 10.16% (CodeLlama), 4.82% (CodeQwen1.5), 30.76% (DeepSeek Coder), 37.93% (DeepSeek Chat) — all well below 50%, meaning roughly 62-95% of responses generate code instead.",
    201       "supported": "strong"
    202     },
    203     {
    204       "claim": "Pass@1 and Test Pass Rate of Code LLMs drop by 35-52% and 17-35% respectively on modified problems.",
    205       "evidence": "Table 3 shows, e.g., ChatGPT Pass@1: 65.58% → 31.34% (52% relative drop); CodeQwen1.5 Chat: 76.83% → 47.61% (38% drop). Statistical significance reported for over 75% of comparisons.",
    206       "supported": "strong"
    207     },
    208     {
    209       "claim": "Okanagan increases Communication Rate and Good Question Rate by absolute 58% and 38% compared to ChatGPT 3.5.",
    210       "evidence": "Table 3: ChatGPT Communication Rate 14.21% vs Okanagan 72.73% (+58.52%); Good Question Rate 13.43% vs 52.24% (+38.81%).",
    211       "supported": "strong"
    212     },
    213     {
    214       "claim": "Okanagan boosts Pass@1 and Test Pass Rate by absolute 8% and 7% over ChatGPT on modified problems.",
    215       "evidence": "Table 3: Okanagan Pass@1 39.62% vs ChatGPT 31.34% (+8.28%); Test Pass Rate 56.98% vs 49.39% (+7.59%).",
    216       "supported": "strong"
    217     },
    218     {
    219       "claim": "Incompleteness triggers higher communication rates but results in lower pass rates than Ambiguity and Inconsistency.",
    220       "evidence": "Table 4 shows for ChatGPT: Communication Rate 1p=31.68% vs 1a=5.84% vs 1c=5.84%; but Test Pass Rate 1p=44.14% < 1a=54.98% < 1c=66.37%.",
    221       "supported": "strong"
    222     },
    223     {
    224       "claim": "Even with 90% of problem description randomly removed, 46% of LLM responses still generate code without asking questions.",
    225       "evidence": "Table 7 shows Communication Rate at 90% removal = 54.1%, meaning 45.9% still generate code; this is using ChatGPT 3.5.",
    226       "supported": "moderate"
    227     },
    228     {
    229       "claim": "The LLM-based evaluator has high False Recovery Rates (30-40%) for some models, potentially inflating pass rates.",
    230       "evidence": "Table 6 shows False Recovery Rate for CodeLlama=32.5%, CodeQwen1.5=40.7%, DeepSeek Coder=38.0%, DeepSeek Chat=34.8%, while Okanagan=0%.",
    231       "supported": "strong"
    232     }
    233   ],
    234   "methodology_tags": [
    235     "benchmark-eval",
    236     "case-study"
    237   ],
    238   "key_findings": "Current Code LLMs have severely limited communication capabilities: over 60% of responses generate code even when problem descriptions are demonstrably incomplete, inconsistent, or ambiguous, and performance drops 35-52% on Pass@1. The Okanagan LLM agent (multi-round with clarification step) increases communication rates by ~58 percentage points and improves pass rates, but introduces a new failure mode of asking unnecessary questions on clear problems. The LLM-based evaluator approach is reasonable but exhibits systematic biases including over-labeling questions as 'Good' and high False Recovery Rates for non-communicating models, complicating metric reliability.",
    239   "red_flags": [
    240     {
    241       "flag": "LLM evaluator False Recovery Rate",
    242       "detail": "The LLM-based evaluator provides answers that recover missing information even when models ask no questions, with False Recovery Rates of 32-41% for CodeLlama, CodeQwen1.5, and DeepSeek Coder. This artificially inflates Pass@1 and Test Pass Rate for non-communicating models, making comparisons unreliable."
    243     },
    244     {
    245       "flag": "No contamination analysis",
    246       "detail": "HumanEvalComm is derived from HumanEval, one of the most widely used code generation benchmarks in LLM training corpora. The paper does not analyze whether models have memorized HumanEval problems, which could affect both baseline and modified problem performance."
    247     },
    248     {
    249       "flag": "No human baseline for main task",
    250       "detail": "There is no human performance baseline on HumanEvalComm itself — human participants are only used to validate the evaluator quality, not to anchor what good performance looks like on the communication task."
    251     },
    252     {
    253       "flag": "Single-domain benchmark overgeneralization",
    254       "detail": "All 762 modified problems are Python algorithmic tasks from HumanEval; the paper title and framing imply broad 'communication competence' evaluation, but the external validity section acknowledges results may not generalize to other programming languages, problem types, or real-world software engineering tasks."
    255     },
    256     {
    257       "flag": "Okanagan primarily evaluated with ChatGPT 3.5",
    258       "detail": "The primary Okanagan results use ChatGPT 3.5 as the base LLM; the DeepSeek Coder variant underperforms, and the paper attributes this to prompt sensitivity without fully investigating it, limiting generalizability of the agent approach."
    259     }
    260   ],
    261   "cited_papers": [
    262     {
    263       "title": "Evaluating large language models trained on code (HumanEval)",
    264       "relevance": "Foundation benchmark that HumanEvalComm is built upon; defines Pass@k metric used throughout"
    265     },
    266     {
    267       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    268       "relevance": "Related code generation benchmark representing real-world software engineering tasks; used as contrast to show current benchmarks don't evaluate communication"
    269     },
    270     {
    271       "title": "Large language models for software engineering: Survey and open problems",
    272       "relevance": "Survey of LLM applications in SE identifying communication gaps as open problem"
    273     },
    274     {
    275       "title": "AgentCoder: Multi-agent-based code generation with iterative testing and optimisation",
    276       "relevance": "LLM agent baseline evaluated on HumanEvalComm; shows 0% communication rate when adapted to the task"
    277     },
    278     {
    279       "title": "SWE-Agent: Agent-computer interfaces enable automated software engineering",
    280       "relevance": "Related LLM agent for software engineering used as context for agent-based approaches"
    281     },
    282     {
    283       "title": "Exploring collaboration mechanisms for LLM agents: A social psychology view",
    284       "relevance": "Framework that inspires Okanagan's multi-round debate/reflection structure"
    285     },
    286     {
    287       "title": "Beyond accuracy: Evaluating self-consistency of code large language models with IdentityChain",
    288       "relevance": "Provides evaluation methodology reused for open-source model testing in the study"
    289     },
    290     {
    291       "title": "The rise and potential of large language model based agents: A survey",
    292       "relevance": "Survey of LLM agent types and capabilities, providing background for the agent evaluation framework"
    293     },
    294     {
    295       "title": "LLM is Like a Box of Chocolates: the Non-determinism of ChatGPT in Code Generation",
    296       "relevance": "Provides metrics (Mean, Variance, Max Diff, Ratio of Worst) reused for the incomplete modification experiments in RQ4"
    297     },
    298     {
    299       "title": "Rethinking Software Engineering in the Foundation Model Era",
    300       "relevance": "Argues for goal-driven AI pair programmers vs task-driven copilots, directly motivating the communication competence framing"
    301     }
    302   ],
    303   "engagement_factors": {
    304     "practical_relevance": {
    305       "score": 2,
    306       "justification": "Practitioners using LLM coding assistants can directly apply the finding that models almost never ask clarifying questions, and Okanagan is available on GitHub as a drop-in improvement."
    307     },
    308     "surprise_contrarian": {
    309       "score": 2,
    310       "justification": "The finding that LLMs continue generating code even with 90% of the description removed challenges assumptions about LLM comprehension capabilities and prompts a reassessment of how 'capable' current models actually are."
    311     },
    312     "fear_safety": {
    313       "score": 0,
    314       "justification": "No safety or AI risk concerns raised; the paper focuses on software quality rather than harmful behaviors."
    315     },
    316     "drama_conflict": {
    317       "score": 1,
    318       "justification": "Mild tension between the narrative of LLMs as capable coders vs. the evidence they can't ask basic clarifying questions, but no direct controversy with other researchers."
    319     },
    320     "demo_ability": {
    321       "score": 2,
    322       "justification": "Full benchmark and Okanagan code are publicly available on GitHub; anyone can run their own model against HumanEvalComm immediately."
    323     },
    324     "brand_recognition": {
    325       "score": 1,
    326       "justification": "University of British Columbia is a respected institution but not a famous AI lab; no big-tech affiliation or celebrity researchers."
    327     }
    328   },
    329   "hn_data": {
    330     "threads": [
    331       {
    332         "hn_id": "45291024",
    333         "title": "Launch HN: Cactus (YC S25) – AI inference on smartphones",
    334         "points": 123,
    335         "comments": 63,
    336         "url": "https://news.ycombinator.com/item?id=45291024",
    337         "created_at": "2025-09-18T15:40:29Z"
    338       },
    339       {
    340         "hn_id": "44430311",
    341         "title": "Small language models are the future of agentic AI",
    342         "points": 113,
    343         "comments": 45,
    344         "url": "https://news.ycombinator.com/item?id=44430311",
    345         "created_at": "2025-07-01T03:33:49Z"
    346       },
    347       {
    348         "hn_id": "36165862",
    349         "title": "The feasibility of artificial consciousness through the lens of neuroscience",
    350         "points": 5,
    351         "comments": 1,
    352         "url": "https://news.ycombinator.com/item?id=36165862",
    353         "created_at": "2023-06-02T14:49:24Z"
    354       },
    355       {
    356         "hn_id": "44246361",
    357         "title": "Small Language Models Are the Future of Agentic AI",
    358         "points": 5,
    359         "comments": 0,
    360         "url": "https://news.ycombinator.com/item?id=44246361",
    361         "created_at": "2025-06-11T11:16:33Z"
    362       },
    363       {
    364         "hn_id": "41137040",
    365         "title": "Positive Mass in General Relativity Without Energy Conditions",
    366         "points": 3,
    367         "comments": 2,
    368         "url": "https://news.ycombinator.com/item?id=41137040",
    369         "created_at": "2024-08-02T08:19:18Z"
    370       },
    371       {
    372         "hn_id": "39941576",
    373         "title": "Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks",
    374         "points": 3,
    375         "comments": 1,
    376         "url": "https://news.ycombinator.com/item?id=39941576",
    377         "created_at": "2024-04-05T12:30:05Z"
    378       },
    379       {
    380         "hn_id": "40310614",
    381         "title": "The AI Review Lottery: Widespread AI-Assisted Peer Reviews Boost Paper Scores",
    382         "points": 2,
    383         "comments": 1,
    384         "url": "https://news.ycombinator.com/item?id=40310614",
    385         "created_at": "2024-05-09T17:31:58Z"
    386       },
    387       {
    388         "hn_id": "45549900",
    389         "title": "Agentic web browsing can't scale with cloud LLMs",
    390         "points": 1,
    391         "comments": 0,
    392         "url": "https://news.ycombinator.com/item?id=45549900",
    393         "created_at": "2025-10-11T15:29:17Z"
    394       },
    395       {
    396         "hn_id": "36179769",
    397         "title": "The feasibility of artificial consciousness through the lens of neuroscience",
    398         "points": 1,
    399         "comments": 2,
    400         "url": "https://news.ycombinator.com/item?id=36179769",
    401         "created_at": "2023-06-03T19:30:29Z"
    402       },
    403       {
    404         "hn_id": "23527095",
    405         "title": "MLOS: An Infrastructure for Automated Software Performance Engineering",
    406         "points": 1,
    407         "comments": 1,
    408         "url": "https://news.ycombinator.com/item?id=23527095",
    409         "created_at": "2020-06-15T13:31:04Z"
    410       }
    411     ],
    412     "top_points": 123,
    413     "total_points": 257,
    414     "total_comments": 116
    415   }
    416 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs