scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (21951B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "LiCoEval: Evaluating LLMs on License Compliance in Code Generation",
      6     "authors": [
      7       "Weiwei Xu",
      8       "Kai Gao",
      9       "Hao He",
     10       "Minghui Zhou"
     11     ],
     12     "year": 2024,
     13     "venue": "Unknown",
     14     "arxiv_id": "2408.02487",
     15     "doi": "10.48550/arXiv.2408.02487"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims of 0.88–2.01% strikingly similar code are supported by Table IV. The claim that most LLMs fail on copyleft licenses is supported by Table IV showing Accc=0 for all models except Claude-3.5-Sonnet.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper makes causal-adjacent claims, e.g., attributing StarCoder2's zero copyleft violations to its 'file-level, fine-grained license detection strategy' (Section VI-A1) based on observational evidence from a single model without controlled comparison.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The title says 'Code Generation' generally and the abstract says 'code generation tasks,' but the study only covers Python function-level code. The threats section acknowledges 'our study primarily focused on Python code' and 'We only addressed function-level code completion,' but the title and abstract overstate the scope.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not substantively discuss alternative explanations for its main findings. For example, it does not consider whether model architecture differences (not just training data) could explain compliance variation, or whether its findings are sensitive to the specific thresholds chosen for the striking similarity standard.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly distinguishes between 'striking similarity' (what they measure) and 'copying relationship' / 'non-independent creation' (what they claim), stating their standard is a 'preliminary standard' that is 'not intended to establish definitive legal boundaries' (Sections III-A, III-F).",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section VI-B 'Threats to validity' provides substantive discussion with internal validity (Section VI-B1) and external validity (Section VI-B2) subsections.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats discussed: the standard 'focuses on precision, potentially overlooking cases' where LLMs generate derived code below threshold; LICOEVAL's 4,187 samples 'may not fully represent the vast diversity of real-world code'; Python-only focus; function-level only limitation.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Explicit scope boundaries: 'primarily focused on Python code' (VI-B2), 'only addressed function-level code completion' (VI-B2), 'findings are not intended to establish definitive legal boundaries' (III-A), and the standard 'may perform poorly on recall' (VI-B1).",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Funding is disclosed in the Acknowledgment section: 'This work is sponsored by the National Natural Science Foundation of China 62332001.'",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are listed: Peking University, University of Science and Technology Beijing, Carnegie Mellon University. No authors are affiliated with the LLM companies being evaluated.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The funder is NSFC (Chinese government science foundation), which has no financial stake in the evaluation outcomes of any specific LLM.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement is present in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "'Striking similarity', 'license compliance capability', and the three license categories (permissive/weak copyleft/strong copyleft) are all explicitly defined with specific criteria and legal grounding.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three contributions are explicitly enumerated: empirical study establishing a striking similarity standard, the LICOEVAL benchmark framework, and evaluation of 14 LLMs.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section II explicitly engages with and critiques prior work—notably Yu et al. [17] for misunderstanding open-source licensing—and situates LICOEVAL relative to memorization studies and code generation benchmarks.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "benchmark-creation": {
    119       "construct_design": {
    120         "construct_validity_argued": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Section III empirically validates why the benchmark's striking similarity criterion implies copying rather than independent creation, with 100% precision on 33 test cases and expert validation from 5 developers and 3 IP lawyers.",
    124           "source": "haiku"
    125         },
    126         "difficulty_distribution_characterized": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "Table III provides complexity statistics but no explicit difficulty tiers are defined; all items meet minimum complexity thresholds (body >10 lines, complexity >3) as a floor, but are not further stratified by difficulty.",
    130           "source": "haiku"
    131         },
    132         "ceiling_floor_effects_checked": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "GLM-4 produces 0% striking similarity and achieves LICO=1.0; the paper mentions this may reflect poor code generation rather than compliance, but no systematic floor/ceiling analysis is conducted.",
    136           "source": "haiku"
    137         },
    138         "human_baseline_included": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "There is no human baseline for the license compliance task itself; the human expert panel validates the striking similarity standard but does not perform the benchmark evaluation task.",
    142           "source": "haiku"
    143         },
    144         "scoring_rubric_justified": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "The LICO metric weights (w1=1, w2=2, w3=4) are assigned with the stated rationale that copyleft licenses carry higher legal risk; the asymmetric weighting is motivated, though not formally derived.",
    148           "source": "haiku"
    149         }
    150       },
    151       "robustness": {
    152         "contamination_resistance_designed": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "The benchmark is a static 4,187-sample dataset published on GitHub; no temporal splits, canary strings, or dynamic generation mechanisms are included to resist gaming.",
    156           "source": "haiku"
    157         },
    158         "temporal_robustness_discussed": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "The paper does not discuss whether the benchmark will remain discriminating as models improve at license attribution, nor whether the fixed sample set will become stale over time.",
    162           "source": "haiku"
    163         },
    164         "failure_modes_discussed": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "The paper discusses the precision-recall tradeoff (standard may miss violations below threshold), scope limitations, and the inherent legal ambiguity in any similarity standard as failure modes.",
    168           "source": "haiku"
    169         },
    170         "baseline_implementations_provided": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "LICOEVAL is publicly available on GitHub [92], and the paper provides full construction pipeline details, similarity analysis methodology, and results tables sufficient for replication.",
    174           "source": "haiku"
    175         }
    176       },
    177       "documentation": {
    178         "dataset_documentation_complete": {
    179           "applies": true,
    180           "answer": true,
    181         "justification": "Data source (World of Code version U), collection methodology, filtering criteria, deduplication strategy, license distribution (Figure 8), and code complexity metrics (Table III) are all documented in detail.",
    182           "source": "haiku"
    183         },
    184         "licensing_and_access_clear": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "The GitHub URL is provided but the benchmark's own distribution license is not stated; given items are drawn from variously-licensed open-source code, the redistribution rights are a non-trivial legal question left unaddressed.",
    188           "source": "haiku"
    189         },
    190         "intended_use_specified": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Section VI-A explicitly addresses intended use for LLM providers, users, open-source communities, and legal professionals; the paper also cautions that high LICO scores for low-quality models lack practical significance.",
    194           "source": "haiku"
    195         }
    196       }
    197     }
    198   },
    199   "claims": [
    200     {
    201       "claim": "Top-performing LLMs produce a non-negligible proportion (0.88% to 2.01%) of code strikingly similar to existing open-source implementations.",
    202       "evidence": "Table IV: DeepSeek-Coder-V2 37 (0.88%), GPT-4o 47 (1.12%), Claude-3.5-Sonnet 84 (2.01%) striking similarity cases out of 4,187 benchmark items.",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "Most LLMs fail entirely to provide accurate license information for copyleft-licensed code; only Claude-3.5-Sonnet provides any (Accc=0.4).",
    207       "evidence": "Table IV shows Accc=0.0 for 13 of 14 models evaluated on copyleft cases; Claude-3.5-Sonnet is the sole exception.",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "Text similarity alone is insufficient to determine non-independent creation; complex code (>10 lines, complexity >3, comment similarity) is required as additional signal.",
    212       "evidence": "Section III-E shows UNSEEN samples occasionally achieving text similarity scores up to 1.0; the combined 4-feature standard achieves 100% precision with zero false positives across 10,000 UNSEEN samples.",
    213       "supported": "strong"
    214     },
    215     {
    216       "claim": "The proposed striking similarity standard achieves 100% precision: all 33 instances meeting the standard were from ACCESSED groups and confirmed by 8 human experts.",
    217       "evidence": "Section III-F: 31 (WizardCoder) + 2 (Poro) hits on ACCESSED_EVAL; human panel averaged 32/33 confirmed as non-independent creation.",
    218       "supported": "strong"
    219     },
    220     {
    221       "claim": "Open-source general-purpose LLMs demonstrate better license compliance performance than closed-source ones.",
    222       "evidence": "Table IV: Qwen2-7B LICO=0.985, GLM-4 LICO=1.0 vs GPT-3.5 0.373, GPT-4 0.376, GPT-4o 0.385; however open-source models also tend to generate less striking-similarity code overall.",
    223       "supported": "weak"
    224     },
    225     {
    226       "claim": "Codestral-22B generates the highest number of striking similarity cases (91, 2.17%) among all evaluated models.",
    227       "evidence": "Table IV: Codestral-22B-v0.1 produces 91 strikingly similar cases, more than Claude-3.5-Sonnet's 84 despite a lower HumanEval score.",
    228       "supported": "strong"
    229     }
    230   ],
    231   "methodology_tags": [
    232     "benchmark-eval",
    233     "observational"
    234   ],
    235   "key_findings": "LICOEVAL is the first benchmark for evaluating LLM license compliance in code generation, containing 4,187 Python function-level snippets from widely-reused open-source files with verified license information. Even top LLMs (GPT-4o, Claude-3.5-Sonnet, DeepSeek-Coder-V2) produce 0.88–2.01% code strikingly similar to training data, and 13 of 14 models fail entirely to provide correct license information for copyleft-licensed code (Accc=0.0). A validated 'striking similarity' standard combining code length (>10 lines), cyclomatic complexity (>3), text similarity (>0.6), and comment matching achieves 100% precision in distinguishing memorized from independently-created code, validated by IP lawyers and developers. The findings expose a systemic gap in LLM license compliance capability with direct legal implications for commercial users.",
    236   "red_flags": [
    237     {
    238       "flag": "Striking similarity standard derived from single model",
    239       "detail": "The four-feature threshold is calibrated on WizardCoder experiments only; generalization to other architectures is partially validated with Poro but assumed for the remaining 12 evaluated LLMs."
    240     },
    241     {
    242       "flag": "Recall unmeasurable by design",
    243       "detail": "The paper explicitly acknowledges the precision-focused standard cannot measure recall; an unknown fraction of actual violations fall below the threshold and are systematically missed."
    244     },
    245     {
    246       "flag": "Arbitrary LICO metric weights",
    247       "detail": "Weights w1=1, w2=2, w3=4 in the LICO metric are asserted without derivation; small changes could substantially alter model rankings."
    248     },
    249     {
    250       "flag": "Floor effect not systematically analyzed",
    251       "detail": "GLM-4 achieves LICO=1.0 with zero striking similarity cases; the paper notes this may reflect poor code quality rather than compliance but performs no systematic check to distinguish these cases."
    252     },
    253     {
    254       "flag": "Python-only evaluation",
    255       "detail": "All 4,187 benchmark items are Python functions; license compliance patterns likely differ for other languages (C, JavaScript, Java) with different licensing traditions and training data composition."
    256     },
    257     {
    258       "flag": "Benchmark distribution license unspecified",
    259       "detail": "The paper releases a benchmark drawn from variously-licensed open-source code without specifying the benchmark's own redistribution license—a legal question that the authors' own domain expertise should have equipped them to address."
    260     },
    261     {
    262       "flag": "No competing interests declaration",
    263       "detail": "No financial interests statement is present despite evaluating commercial models (GPT-4, Claude, Gemini) implicated in ongoing IP litigation referenced in the paper's introduction."
    264     }
    265   ],
    266   "cited_papers": [
    267     {
    268       "title": "Evaluating Large Language Models Trained on Code (HumanEval, Chen et al. 2021)",
    269       "relevance": "Foundational benchmark used to select and compare code generation models evaluated in LICOEVAL"
    270     },
    271     {
    272       "title": "Unveiling memorization in code models (Yang et al., ICSE 2024)",
    273       "relevance": "Prior work on memorization in LLMs for code that LICOEVAL extends with a legal compliance framing"
    274     },
    275     {
    276       "title": "Traces of memorisation in large language models for code (Al-Kaswan et al., ICSE 2024)",
    277       "relevance": "Related memorization study directly motivating the compliance capability gap LICOEVAL addresses"
    278     },
    279     {
    280       "title": "CodeIPPrompt: Intellectual property infringement assessment of code language models (Yu et al., ICML 2023)",
    281       "relevance": "Most directly related prior work; paper critiques its assumption that LLMs should never generate licensed code"
    282     },
    283     {
    284       "title": "StarCoder: May the Source Be With You (Li et al., 2023)",
    285       "relevance": "Primary training dataset (Starcoderdata) for the empirical study; StarCoder2's file-level license filtering is a key comparison point"
    286     },
    287     {
    288       "title": "Understanding and remediating open-source license incompatibilities in the PyPI ecosystem (Xu et al., ASE 2023)",
    289       "relevance": "Authors' prior work providing the license identification methodology used in LICOEVAL construction"
    290     },
    291     {
    292       "title": "World of Code: An infrastructure for mining the universe of open source VCS data (Ma et al., MSR 2019)",
    293       "relevance": "Primary data source (173M repos, version U) from which LICOEVAL benchmark items are mined"
    294     },
    295     {
    296       "title": "Open source license inconsistencies on GitHub (Wolter et al., TOSEM 2023)",
    297       "relevance": "Documents file-level vs. repository-level license discrepancy, explaining why copyleft code persists in supposedly-clean training sets"
    298     }
    299   ],
    300   "engagement_factors": {
    301     "practical_relevance": {
    302       "score": 3,
    303       "justification": "Directly actionable for any company using LLM code generation commercially—quantifies legal exposure per specific model with a usable benchmark."
    304     },
    305     "surprise_contrarian": {
    306       "score": 2,
    307       "justification": "Surprising that open-source Qwen2-7B outperforms GPT-4o on compliance, and that DeepSeek-Coder-V2 provides zero correct license information despite being a top performer."
    308     },
    309     "fear_safety": {
    310       "score": 2,
    311       "justification": "Raises concrete IP infringement liability concerns for commercial users, directly tied to the ongoing GitHub Copilot lawsuit referenced in the introduction."
    312     },
    313     "drama_conflict": {
    314       "score": 2,
    315       "justification": "Directly relevant to the GitHub Copilot class action lawsuit and open-source community complaints about code laundering cited explicitly in the paper."
    316     },
    317     "demo_ability": {
    318       "score": 2,
    319       "justification": "LICOEVAL is publicly available on GitHub; practitioners can immediately test their own models against the benchmark."
    320     },
    321     "brand_recognition": {
    322       "score": 2,
    323       "justification": "Evaluates high-profile commercial models (GPT-4o, Claude-3.5-Sonnet, Gemini-1.5-Pro) and open-source models; Peking University affiliation with CMU co-author."
    324     }
    325   },
    326   "hn_data": {
    327     "threads": [
    328       {
    329         "hn_id": "40099807",
    330         "title": "How to avoid machine learning pitfalls",
    331         "points": 7,
    332         "comments": 0,
    333         "url": "https://news.ycombinator.com/item?id=40099807",
    334         "created_at": "2024-04-20T18:44:43Z"
    335       },
    336       {
    337         "hn_id": "28100666",
    338         "title": "How to avoid machine learning pitfalls: a guide for academic researchers",
    339         "points": 5,
    340         "comments": 1,
    341         "url": "https://news.ycombinator.com/item?id=28100666",
    342         "created_at": "2021-08-07T18:16:43Z"
    343       },
    344       {
    345         "hn_id": "40686765",
    346         "title": "Converting In-Context Learning to Weights in Linearized-Attention Transformers",
    347         "points": 4,
    348         "comments": 1,
    349         "url": "https://news.ycombinator.com/item?id=40686765",
    350         "created_at": "2024-06-15T01:28:23Z"
    351       },
    352       {
    353         "hn_id": "32344842",
    354         "title": "On the independence between consciousness and computational intelligence",
    355         "points": 3,
    356         "comments": 2,
    357         "url": "https://news.ycombinator.com/item?id=32344842",
    358         "created_at": "2022-08-04T16:16:07Z"
    359       },
    360       {
    361         "hn_id": "28106281",
    362         "title": "How to avoid machine learning pitfalls: a guide for academic researchers",
    363         "points": 3,
    364         "comments": 0,
    365         "url": "https://news.ycombinator.com/item?id=28106281",
    366         "created_at": "2021-08-08T12:43:38Z"
    367       },
    368       {
    369         "hn_id": "28105257",
    370         "title": "How to avoid machine learning pitfalls: a guide for academic researchers",
    371         "points": 3,
    372         "comments": 0,
    373         "url": "https://news.ycombinator.com/item?id=28105257",
    374         "created_at": "2021-08-08T09:10:08Z"
    375       },
    376       {
    377         "hn_id": "28088621",
    378         "title": "Poison Ink: Robust and Invisible Backdoor Attack",
    379         "points": 2,
    380         "comments": 1,
    381         "url": "https://news.ycombinator.com/item?id=28088621",
    382         "created_at": "2021-08-06T15:36:58Z"
    383       },
    384       {
    385         "hn_id": "35087020",
    386         "title": "How to avoid machine learning pitfalls: a guide for academic researchers",
    387         "points": 2,
    388         "comments": 0,
    389         "url": "https://news.ycombinator.com/item?id=35087020",
    390         "created_at": "2023-03-09T21:32:21Z"
    391       },
    392       {
    393         "hn_id": "28088769",
    394         "title": "A Survey of Honeypots and Honeynets for Internet of Things",
    395         "points": 1,
    396         "comments": 1,
    397         "url": "https://news.ycombinator.com/item?id=28088769",
    398         "created_at": "2021-08-06T15:45:56Z"
    399       },
    400       {
    401         "hn_id": "44197658",
    402         "title": "Quantum Mixed-State Self-Attention Network",
    403         "points": 1,
    404         "comments": 0,
    405         "url": "https://news.ycombinator.com/item?id=44197658",
    406         "created_at": "2025-06-06T03:44:14Z"
    407       }
    408     ],
    409     "top_points": 7,
    410     "total_points": 31,
    411     "total_comments": 6
    412   }
    413 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs