scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22263B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "LiCoEval: Evaluating LLMs on License Compliance in Code Generation",
      6     "authors": [
      7       "Weiwei Xu",
      8       "Kai Gao",
      9       "Hao He",
     10       "Minghui Zhou"
     11     ],
     12     "year": 2024,
     13     "venue": "Unknown",
     14     "arxiv_id": "2408.02487",
     15     "doi": "10.48550/arXiv.2408.02487"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims—0.88% to 2.01% strikingly similar output, most LLMs failing on copyleft license info, and the benchmark contribution—are directly supported by Table IV results and the empirical study.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The ACCESSED vs UNSEEN quasi-experimental design is appropriate for the paper's modest causal claim that training data exposure causes memorization; 10,000 samples per group with LSH-based deduplication to verify true unseen status is a reasonable methodology.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper explicitly scopes findings to Python, function-level code, and 4,187 benchmark samples; Section VI.B acknowledges these constraints rather than over-generalizing to all languages or code granularities.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "For the license compliance failure finding, authors speculate about post-processing output filters in closed-source models but do not systematically explore alternative explanations such as prompt sensitivity, how license info is requested, or whether different elicitation strategies would yield higher accuracy.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper measures license compliance only for the subset of code meeting the striking similarity threshold, which the authors explicitly acknowledge has poor recall; the relationship between this narrow proxy and broader real-world compliance risk is not quantified or discussed.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section VI.B 'Threats to validity' contains dedicated subsections for Internal and External validity with substantive content beyond a single sentence.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats are named: precision-over-recall tradeoff of the similarity standard, Python-only scope, function-level only analysis (missing class/project-level), and 4,187 samples not covering full real-world diversity.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly states it covers only Python function-level code and that findings are 'not intended to establish definitive legal guidelines,' bounding what should and should not be concluded.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Funding is explicitly acknowledged: 'This work is sponsored by the National Natural Science Foundation of China 62332001.'",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly listed on the title page: Peking University, University of Science and Technology Beijing, and Carnegie Mellon University.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "NSFC is a Chinese government science funding body with no stake in the performance of any of the 14 LLM vendors evaluated.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or financial interests declaration appears anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "'Striking similarity' is operationalized with four specific criteria (body lines > 10, complexity > 3, text similarity > 0.6, identical comments > 0); 'license compliance' and the LICO metric are explicitly defined with formula and weight justification.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three numbered contributions are explicitly stated in the introduction: empirical study establishing striking similarity standard, LICOEVAL benchmark, and evaluation of 14 LLMs.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper specifically engages with Yu et al. [17] (CodeIPPrompt), the most closely related prior work, and explains a substantive framing difference: evaluating compliance capability rather than simply detecting whether licensed code is generated.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "benchmark-creation": {
    119       "construct_design": {
    120         "construct_validity_argued": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "The paper argues that striking similarity implies memorization and that if an LLM memorized code it should also be able to recall the associated license from its training context; the empirical study and expert panel validation in Sections III.E-F support this reasoning chain.",
    124           "source": "haiku"
    125         },
    126         "difficulty_distribution_characterized": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "Benchmark items are characterized by complexity metrics (cyclomatic complexity, body lines, reuse count) but no difficulty tiers (easy/medium/hard) are defined for the license compliance task itself, and the paper does not assess which item types are harder for LLMs.",
    130           "source": "haiku"
    131         },
    132         "ceiling_floor_effects_checked": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "For copyleft license accuracy, 13 of 14 models score Accc=0.0—a clear floor effect that the paper notes but does not systematically investigate; no attempt is made to increase discriminability for this critical dimension.",
    136           "source": "haiku"
    137         },
    138         "human_baseline_included": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Human experts validated the striking similarity standard in Section III.F but no human baseline for the main license compliance task (can humans identify licenses from generated strikingly similar code?) is reported.",
    142           "source": "haiku"
    143         },
    144         "scoring_rubric_justified": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "LICO weights (w1=1, w2=2, w3=4) are stated to emphasize copyleft due to legal risk, but no empirical, legal, or sensitivity analysis grounds the specific values; different weightings would substantially alter model rankings.",
    148           "source": "haiku"
    149         }
    150       },
    151       "robustness": {
    152         "contamination_resistance_designed": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "The benchmark is built from publicly available open-source code likely already in many LLMs' training data; no temporal split, canary strings, or dynamic generation mechanism prevents future models from training on the benchmark itself.",
    156           "source": "haiku"
    157         },
    158         "temporal_robustness_discussed": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "No discussion of how the benchmark should be updated as LLMs improve or training practices evolve; the WoC version U (October 2021) data source is already over two years old relative to the 2024 publication date, and its age is not acknowledged as a limitation.",
    162           "source": "haiku"
    163         },
    164         "failure_modes_discussed": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The paper discusses the precision-over-recall limitation of the similarity threshold but does not discuss how the benchmark itself could be gamed—e.g., post-processing to always refuse license queries, prompt sensitivity in the license elicitation step, or variations in the follow-up query format.",
    168           "source": "haiku"
    169         },
    170         "baseline_implementations_provided": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "LICOEVAL is publicly released on GitHub (https://github.com/osslab-pku/LiCoEval) and the evaluation framework is described in sufficient detail to reproduce the reported results.",
    174           "source": "haiku"
    175         }
    176       },
    177       "documentation": {
    178         "dataset_documentation_complete": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Data source (World of Code version U, October 2021), collection methodology (c2fbb and b2p database queries, filtering steps, deduplication), license distribution (Figure 8), and code metrics (Table III) are all documented in detail.",
    182           "source": "haiku"
    183         },
    184         "licensing_and_access_clear": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "The benchmark is available on GitHub but no license is specified for the dataset itself; given that it consists of snippets from copyleft-licensed repositories, the redistribution terms are legally complex and entirely unaddressed—an ironic gap in a paper about license compliance.",
    188           "source": "haiku"
    189         },
    190         "intended_use_specified": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "The paper specifies LICOEVAL is for evaluating LLM license compliance capability and explicitly states it is 'not intended to establish definitive legal guidelines,' bounding appropriate conclusions.",
    194           "source": "haiku"
    195         }
    196       }
    197     }
    198   },
    199   "claims": [
    200     {
    201       "claim": "Top-performing LLMs (GPT-4o, Claude-3.5-Sonnet, DeepSeek-Coder-V2) produce 0.88% to 2.01% of outputs strikingly similar to existing open-source code.",
    202       "evidence": "Table IV reports 47 (1.12%), 84 (2.01%), and 37 (0.88%) strikingly similar cases respectively out of 4,187 benchmark items; non-trivial even for state-of-the-art models.",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "Almost all LLMs fail entirely to provide correct license information for copyleft-licensed code; only Claude-3.5-Sonnet achieves non-zero copyleft accuracy (Accc=0.4).",
    207       "evidence": "Table IV shows Accc=0.0 for 13 of 14 models; DeepSeek-Coder-V2 achieves 0% license accuracy across all of its strikingly similar cases despite strong code generation performance.",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "The proposed four-criterion striking similarity standard achieves 100% precision in identifying non-independent creation.",
    212       "evidence": "33 outputs meeting the standard from ACCESSED_EVAL group (31 WizardCoder + 2 Poro), 0 from UNSEEN_EVAL; expert panel confirmed 32/33 as non-independently created (97%+ agreement).",
    213       "supported": "strong"
    214     },
    215     {
    216       "claim": "Text similarity metrics alone are insufficient to determine non-independent creation in LLM-generated code.",
    217       "evidence": "Figure 4 shows overlapping ACCESSED and UNSEEN distributions for BLEU-4, Jaccard, and edit-distance; UNSEEN group occasionally reaches similarity=1 for simple functions.",
    218       "supported": "strong"
    219     },
    220     {
    221       "claim": "Open-source general LLMs demonstrate superior compliance performance compared to closed-source general LLMs.",
    222       "evidence": "Qwen2-7B (LICO 0.985) and GLM-4-9B (LICO 1.0) outperform GPT-4o (0.385) and Claude-3.5-Sonnet (0.571), but this conflates model size, architecture, training data, and potential output filtering differences.",
    223       "supported": "weak"
    224     },
    225     {
    226       "claim": "StarCoder2's file-level copyleft license exclusion during training yields zero strikingly similar cases for copyleft code.",
    227       "evidence": "Table IV shows #copyleft=0 for StarCoder2-15B-Instruct; paper attributes this to Stack V2 pipeline excluding copyleft-licensed files, though this attribution is inferential not verified.",
    228       "supported": "moderate"
    229     }
    230   ],
    231   "methodology_tags": [
    232     "benchmark-eval",
    233     "observational"
    234   ],
    235   "key_findings": "LICOEVAL is the first benchmark for evaluating LLM license compliance in code generation, comprising 4,187 function-level Python snippets from widely-reused open-source files with explicit license headers. Even top-performing LLMs (GPT-4o, Claude-3.5-Sonnet, DeepSeek-Coder-V2) produce 0.88%–2.01% of code strikingly similar to existing implementations—a non-negligible compliance risk. Critically, 13 of 14 evaluated LLMs completely fail on copyleft license compliance (Accc=0.0), with only Claude-3.5-Sonnet providing any correct copyleft license information (Accc=0.4). High code generation accuracy (Pass@1) does not predict license compliance capability; smaller open-source models achieve higher LICO scores, partly because they generate less strikingly similar code overall.",
    236   "red_flags": [
    237     {
    238       "flag": "Arbitrary LICO weights",
    239       "detail": "The LICO metric weights (w1=1, w2=2, w3=4) have no empirical, legal, or sensitivity-analysis grounding; different weight choices would substantially change model rankings."
    240     },
    241     {
    242       "flag": "Floor effect on copyleft accuracy",
    243       "detail": "13 of 14 models score Accc=0.0 on copyleft licenses, making the most legally significant dimension completely non-discriminating between models."
    244     },
    245     {
    246       "flag": "Threshold calibrated on single model",
    247       "detail": "The striking similarity threshold was derived from WizardCoder experiments alone, then validated on WizardCoder and Poro (same training data); validity for architecturally different models with different training corpora is not established."
    248     },
    249     {
    250       "flag": "Perfect LICO for poor-quality models",
    251       "detail": "GLM-4-9B achieves LICO=1.0 by generating zero strikingly similar cases, but this may reflect code quality limitations rather than genuine compliance training; the paper acknowledges this caveat but does not resolve it."
    252     },
    253     {
    254       "flag": "Benchmark's own licensing unspecified",
    255       "detail": "The benchmark dataset includes snippets from copyleft-licensed repositories but specifies no license for the benchmark itself, creating an ironic IP ambiguity in a paper focused on IP compliance."
    256     },
    257     {
    258       "flag": "License elicitation sensitivity untested",
    259       "detail": "License information is elicited via a single follow-up prompt; no sensitivity analysis tests whether different prompting strategies or question formulations would yield materially different accuracy results."
    260     }
    261   ],
    262   "cited_papers": [
    263     {
    264       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    265       "relevance": "Primary code generation accuracy benchmark used to select models for evaluation and establish baseline capability context via Pass@1"
    266     },
    267     {
    268       "title": "StarCoder: May the Source Be With You!",
    269       "relevance": "Source of Starcoderdata training set used in empirical study; key example of license filtering during training data construction directly relevant to compliance findings"
    270     },
    271     {
    272       "title": "CodeIPPrompt: Intellectual Property Infringement Assessment of Code Language Models",
    273       "relevance": "Most closely related prior work; paper explicitly differentiates its framing (evaluating compliance capability vs. detecting any generation of licensed code)"
    274     },
    275     {
    276       "title": "Unveiling Memorization in Code Models",
    277       "relevance": "Prior work establishing that code LLMs memorize training data; provides methodological baseline and motivation for the benchmark"
    278     },
    279     {
    280       "title": "Traces of Memorisation in Large Language Models for Code",
    281       "relevance": "Related ICSE 2024 work on memorization in code LLMs; part of the literature that motivates the compliance risk studied"
    282     },
    283     {
    284       "title": "World of Code: An Infrastructure for Mining the Universe of Open Source VCS Data",
    285       "relevance": "Primary data source for benchmark construction; blob-to-project database enables identification of widely-reused licensed code files"
    286     },
    287     {
    288       "title": "Understanding and Remediating Open-Source License Incompatibilities in the PyPI Ecosystem",
    289       "relevance": "Authors' prior work providing the keyword/rule-based license identification methodology used to label benchmark items"
    290     },
    291     {
    292       "title": "StarCoder 2 and The Stack V2: The Next Generation",
    293       "relevance": "Evaluated model whose file-level copyleft exclusion strategy results in zero strikingly similar copyleft cases—directly relevant to training data practices discussion"
    294     }
    295   ],
    296   "engagement_factors": {
    297     "practical_relevance": {
    298       "score": 3,
    299       "justification": "Directly actionable for developers and enterprises using AI coding tools—legal IP compliance is a live business risk with active litigation (GitHub Copilot lawsuit)."
    300     },
    301     "surprise_contrarian": {
    302       "score": 2,
    303       "justification": "Smaller open-source models (Qwen2-7B LICO 0.985) outperforming flagship closed-source models (GPT-4o LICO 0.385) on compliance, and DeepSeek completely failing on license attribution despite strong code scores, are counterintuitive."
    304     },
    305     "fear_safety": {
    306       "score": 2,
    307       "justification": "Raises concrete legal risk—companies using AI-generated code may unknowingly violate copyleft licenses, with near-universal model failure on the highest-risk license category."
    308     },
    309     "drama_conflict": {
    310       "score": 2,
    311       "justification": "Situates the work against the ongoing GitHub Copilot litigation; the finding that most models score 0% on copyleft compliance despite generating copyleft-derived code is provocative."
    312     },
    313     "demo_ability": {
    314       "score": 2,
    315       "justification": "Benchmark is publicly available on GitHub; practitioners can run the evaluation framework against any code generation model they use."
    316     },
    317     "brand_recognition": {
    318       "score": 1,
    319       "justification": "Peking University and CMU affiliations but no famous lab brand; evaluates well-known models including GPT-4o, Claude-3.5-Sonnet, and DeepSeek-Coder-V2."
    320     }
    321   },
    322   "hn_data": {
    323     "threads": [
    324       {
    325         "hn_id": "40099807",
    326         "title": "How to avoid machine learning pitfalls",
    327         "points": 7,
    328         "comments": 0,
    329         "url": "https://news.ycombinator.com/item?id=40099807",
    330         "created_at": "2024-04-20T18:44:43Z"
    331       },
    332       {
    333         "hn_id": "28100666",
    334         "title": "How to avoid machine learning pitfalls: a guide for academic researchers",
    335         "points": 5,
    336         "comments": 1,
    337         "url": "https://news.ycombinator.com/item?id=28100666",
    338         "created_at": "2021-08-07T18:16:43Z"
    339       },
    340       {
    341         "hn_id": "40686765",
    342         "title": "Converting In-Context Learning to Weights in Linearized-Attention Transformers",
    343         "points": 4,
    344         "comments": 1,
    345         "url": "https://news.ycombinator.com/item?id=40686765",
    346         "created_at": "2024-06-15T01:28:23Z"
    347       },
    348       {
    349         "hn_id": "32344842",
    350         "title": "On the independence between consciousness and computational intelligence",
    351         "points": 3,
    352         "comments": 2,
    353         "url": "https://news.ycombinator.com/item?id=32344842",
    354         "created_at": "2022-08-04T16:16:07Z"
    355       },
    356       {
    357         "hn_id": "28106281",
    358         "title": "How to avoid machine learning pitfalls: a guide for academic researchers",
    359         "points": 3,
    360         "comments": 0,
    361         "url": "https://news.ycombinator.com/item?id=28106281",
    362         "created_at": "2021-08-08T12:43:38Z"
    363       },
    364       {
    365         "hn_id": "28105257",
    366         "title": "How to avoid machine learning pitfalls: a guide for academic researchers",
    367         "points": 3,
    368         "comments": 0,
    369         "url": "https://news.ycombinator.com/item?id=28105257",
    370         "created_at": "2021-08-08T09:10:08Z"
    371       },
    372       {
    373         "hn_id": "28088621",
    374         "title": "Poison Ink: Robust and Invisible Backdoor Attack",
    375         "points": 2,
    376         "comments": 1,
    377         "url": "https://news.ycombinator.com/item?id=28088621",
    378         "created_at": "2021-08-06T15:36:58Z"
    379       },
    380       {
    381         "hn_id": "35087020",
    382         "title": "How to avoid machine learning pitfalls: a guide for academic researchers",
    383         "points": 2,
    384         "comments": 0,
    385         "url": "https://news.ycombinator.com/item?id=35087020",
    386         "created_at": "2023-03-09T21:32:21Z"
    387       },
    388       {
    389         "hn_id": "28088769",
    390         "title": "A Survey of Honeypots and Honeynets for Internet of Things",
    391         "points": 1,
    392         "comments": 1,
    393         "url": "https://news.ycombinator.com/item?id=28088769",
    394         "created_at": "2021-08-06T15:45:56Z"
    395       },
    396       {
    397         "hn_id": "44197658",
    398         "title": "Quantum Mixed-State Self-Attention Network",
    399         "points": 1,
    400         "comments": 0,
    401         "url": "https://news.ycombinator.com/item?id=44197658",
    402         "created_at": "2025-06-06T03:44:14Z"
    403       }
    404     ],
    405     "top_points": 7,
    406     "total_points": 31,
    407     "total_comments": 6
    408   }
    409 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs