scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28426B)
      1 {
      2   "paper": {
      3     "title": "LiCoEval: Evaluating LLMs on License Compliance in Code Generation",
      4     "authors": ["Weiwei Xu", "Kai Gao", "Hao He", "Minghui Zhou"],
      5     "year": 2024,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2408.02487",
      8     "doi": "10.48550/arXiv.2408.02487"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "observational"],
     13   "key_findings": "Even top-performing LLMs produce 0.88–2.01% of code strikingly similar to existing open-source implementations, and most fail to provide accurate license information, particularly for copyleft licenses (only Claude-3.5-Sonnet provides any copyleft license info). The paper establishes a four-criteria striking similarity standard validated by expert review (8 reviewers, 32/33 precision) and introduces LiCoEval, the first benchmark for evaluating LLM license compliance capability across 14 models.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The benchmark is released at https://github.com/osslab-pku/LiCoEval as stated in Section VII and reference [92]."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "LICOEVAL benchmark (4,187 function-level code snippets) is made publicly available at the same repository."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment specifications are provided in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The paper describes the methodology but does not include scripts or commands to replicate the evaluation."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Results in Table IV are point estimates (counts, percentages, accuracy scores) with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Mann-Whitney U test is used in Section III-E (Table II) to compare ACCESSED vs UNSEEN groups, with Cliff's Delta reported as the effect size. However, the main 14-model evaluation in Section V reports raw comparisons without significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Cliff's Delta effect sizes are reported in Table II with interpretation levels (all negligible). Section V reports percentages of strikingly similar cases providing context for magnitude."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper uses 10,000 samples per group and 4,187 benchmark samples but provides no explicit justification for these sizes and no power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "All results use greedy decoding (temperature=0) with single-run numbers. No variance across runs is reported."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "14 LLMs are compared against each other, and the ACCESSED vs UNSEEN comparison serves as the baseline for establishing the striking similarity standard."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The evaluation includes 2024-era models: GPT-4o, Claude-3.5-Sonnet, DeepSeek-Coder-V2, Gemini-1.5-Pro, and other recent models (Table IV)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The striking similarity standard has four criteria (body lines, complexity, text similarity, identical comments) but no formal ablation study systematically removes individual criteria to measure their contribution."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: BLEU-4, Jaccard similarity, edit distance for text similarity; cyclomatic complexity, body lines, comment similarity for characterizing striking similarity; and the composite LICO score for compliance."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Eight expert reviewers (5 developers with >6 years experience, 3 IP lawyers) evaluated 33 strikingly similar code pairs in Section III-F, identifying 32/33 as cases where independent creation can be excluded."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "ACCESSED_EVAL and UNSEEN_EVAL groups (10,000 samples each, non-overlapping with original groups) were constructed for validation, tested with WizardCoder and Poro-34B-chat (Section III-F)."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by license type (permissive, weak copyleft, strong copyleft) in Figure 8 and Table IV, and per-model results are provided."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figure 6 shows Example-2, a case that falls below the striking similarity standard (ambiguous case). Models that completely fail at compliance are discussed (DeepSeek-Coder-V2 accuracy=0, WizardCoder-Python-13B LICO=0.153)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that text similarity alone is insufficient to determine non-independent creation (Section III-E). Most models fail completely on copyleft license compliance. The standard's potential weakness on recall is acknowledged."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of 0.88–2.01% strikingly similar code are supported by Table IV. The claim that most LLMs fail on copyleft licenses is supported by Table IV showing copyleft accuracy Acc_c=0 for all models except Claude-3.5-Sonnet."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes causal-adjacent claims, e.g., attributing StarCoder2's zero copyleft strikingly similar cases to its 'file-level, fine-grained license detection strategy' (Section VI-A1) based on observational evidence from a single model without controlled comparison."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title says 'Code Generation' generally and the abstract says 'code generation tasks,' but the study only covers Python function-level code. The threats section acknowledges 'our study primarily focused on Python code' and 'We only addressed function-level code completion,' but the title and abstract overstate the scope."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not substantively discuss alternative explanations for its main findings. For example, it does not consider whether model architecture differences (not just training data) could explain compliance variation, or whether the striking similarity standard thresholds are sensitive to the specific choices made."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly distinguishes between 'striking similarity' (what they measure) and 'copying relationship' / 'non-independent creation' (what they claim), stating their standard is a 'preliminary standard' that is 'not intended to establish definitive legal boundaries' (Sections III-A, III-F)."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Models are listed by marketing names only (GPT-3.5-Turbo, GPT-4-Turbo, GPT-4o, Claude-3.5-Sonnet, Gemini-1.5-Pro) without specific API versions or snapshot dates. Open-source models include sizes (e.g., Qwen2-7B-Instruct) but still lack version specifics."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper describes prompt construction (file header + imports + signature + docstring) and mentions 'one-shot approach' but does not provide the actual one-shot example, system instructions, or the follow-up prompt used to ask for license information."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Temperature is set to 0 with greedy decoding, stated in Section III-C and Section V-A."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The evaluation is direct prompting of LLMs for code completion."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Detailed preprocessing pipeline is documented: function extraction from Starcoderdata (74.7M → 2.6M after filtering → 10K sampled), UNSEEN construction with MinHash/LSH dedup (Jaccard threshold 0.2), LICOEVAL construction from WoC (700M blobs → filtering → 10K candidates → 4,187 final). Counts at each stage provided."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section VI-B 'Threats to validity' provides substantive discussion with internal validity (Section VI-B1) and external validity (Section VI-B2) subsections."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats discussed: the standard 'focuses on precision, potentially overlooking cases' where LLMs generate derived code below threshold; LICOEVAL's 4,187 samples 'may not fully represent the vast diversity of real-world code'; Python-only focus; function-level only limitation."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Explicit scope boundaries: 'primarily focused on Python code' (VI-B2), 'only addressed function-level code completion' (VI-B2), 'findings are not intended to establish definitive legal boundaries' (III-A), and the standard 'may perform poorly on recall' (VI-B1)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "LICOEVAL benchmark is publicly available at https://github.com/osslab-pku/LiCoEval (reference [92])."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Detailed data collection from Starcoderdata, CodeParrot-clean, and World of Code (version U, Oct 2021) with explicit filtering criteria and processing steps described in Sections III-D1 and IV-B1."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The expert panel (5 developers, 3 lawyers) is characterized by qualifications ('over six years of coding experience,' 'specializing in software intellectual property') but the recruitment channel and method are not described."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Full pipeline documented with counts: 74.7M functions → 2.6M after filtering → 10K sampled for ACCESSED; separate pipeline for UNSEEN with LSH dedup yielding 157,273 candidates → 10K sampled; LICOEVAL construction from 700M WoC blobs → filtering → 10K candidates → 4,187 final."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding is disclosed in the Acknowledgment section: 'This work is sponsored by the National Natural Science Foundation of China 62332001.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: Peking University, University of Science and Technology Beijing, Carnegie Mellon University. No authors are affiliated with the LLM companies being evaluated."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The funder is NSFC (Chinese government science foundation), which has no financial stake in the evaluation outcomes of any specific LLM."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Training data cutoff dates are not stated for any of the 14 evaluated LLMs. The paper only discusses WizardCoder's training data (The Stack) in the empirical study."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "The entire empirical study (Section III) is about distinguishing accessed vs unseen training data. The benchmark is deliberately constructed from widely-reused code likely in training data. For WizardCoder, overlap is explicitly characterized using MinHash/LSH."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "The paper's methodology explicitly addresses contamination by design: UNSEEN group uses code verified absent from WizardCoder's training set via MinHash/LSH (Jaccard threshold 0.2), and the striking similarity standard is calibrated to distinguish memorized from independently created code."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "The paper is a benchmark evaluation study. The expert validation panel is a methodological tool for standard validation, not a pre-registerable human subjects study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "The paper is primarily a benchmark evaluation. The expert panel serves as validators of the striking similarity standard, not as research subjects."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "The paper is a benchmark evaluation study. The expert panel's qualifications are described but this is not a human subjects study requiring demographic reporting."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "Not a human subjects study. Expert validators were selected for domain expertise."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "Not an experimental study with human participants assigned to conditions."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "Not a human subjects experiment. The experts reviewed the code pairs 'without knowledge of their origins', but this is standard validation methodology, not a blinding protocol for a human study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "Not a human subjects study. All 8 expert validators completed their evaluations."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No API costs, token usage, or wall-clock time are reported for running evaluations on 4,187 benchmark samples across 14 models."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Hardware constraints are mentioned as a factor in selecting WizardCoder but specific hardware, GPU hours, or total compute budget are not stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Greedy decoding (temperature=0) is used, making results deterministic. No seed sensitivity analysis is performed or discussed."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of runs is not stated. Greedy decoding (temperature=0) implies a single deterministic run, but the paper never says so explicitly."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The striking similarity thresholds were derived from empirical analysis but no systematic search over alternative thresholds was documented."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
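    311         "justification": "The striking similarity standard (4 criteria) is empirically derived from ACCESSED vs UNSEEN comparison and validated on held-out data (ACCESSED_EVAL, UNSEEN_EVAL): all 33 samples meeting the standard (from 2 models) came from ACCESSED_EVAL, i.e., 100% precision against the accessed/unseen ground truth, and expert review separately confirmed 32/33 of them."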
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Four Mann-Whitney U tests are run (Table II) with all p-values reported as <0.01, but no multiple comparison correction (Bonferroni, etc.) is applied."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors designed LICOEVAL and evaluate models on it without discussing the potential bias of evaluating on their own benchmark or whether the benchmark's design choices favor certain types of models."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Models range from 7B to very large (GPT-4) with vastly different compute requirements, but performance is not discussed as a function of compute budget."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper validates the benchmark through the empirical study (Section III) establishing the striking similarity standard, expert review (8 reviewers confirming 32/33 cases), and held-out validation (ACCESSED_EVAL/UNSEEN_EVAL with 100% precision). Limitations of precision vs recall are discussed."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. All models are evaluated with direct one-shot prompting."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The benchmark uses code from WoC version U (October 2021) and widely-reused code that predates most models' training. However, temporal leakage implications for the 14 evaluated models are not explicitly discussed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The prompt includes file header comments (which may contain license information for LICOEVAL's code files). When the LLM is later asked to provide license info in the follow-up inquiry, it may have already seen the license in the prompt context. This potential feature leakage is not discussed."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": true,
    353         "justification": "The empirical study explicitly addresses non-independence: UNSEEN group is verified absent from training data via MinHash/LSH dedup (Jaccard threshold 0.2, more stringent than the 0.5 used in Starcoderdata). The entire study design separates accessed from unseen code."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "MinHash + Locality-Sensitive Hashing is used to detect and prevent data leakage between the UNSEEN group and WizardCoder's training set, with 5-grams and a Jaccard similarity threshold of 0.2 (Section III-D1)."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Even top-performing LLMs produce a non-negligible proportion (0.88% to 2.01%) of code strikingly similar to existing open-source implementations.",
    365       "evidence": "Table IV shows GPT-4o produces 47 (1.12%), Claude-3.5-Sonnet produces 84 (2.01%), and DeepSeek-Coder-V2 produces 37 (0.88%) strikingly similar cases out of 4,187 benchmark samples.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Most LLMs fail to provide accurate license information for code under copyleft licenses, with only Claude-3.5-Sonnet demonstrating some ability.",
    370       "evidence": "Table IV shows copyleft accuracy Acc_c=0.0 for all models except Claude-3.5-Sonnet (Acc_c=0.4). All other models completely fail on copyleft license identification.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "The four-criteria striking similarity standard can effectively exclude the possibility of independent creation.",
    375       "evidence": "In the empirical study (Section III-E), 0/10,000 UNSEEN samples meet the standard while 24/10,000 ACCESSED samples do. Validated on held-out data with WizardCoder (31 cases) and Poro (2 cases), all from ACCESSED_EVAL. Expert validation: 8 reviewers identified 32/33 cases as non-independent creation (Section III-F).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Text similarity metrics alone cannot determine non-independent creation in LLM-generated code.",
    380       "evidence": "Figure 4 shows UNSEEN cases can reach high text similarity (including score of 1.0), demonstrating that text similarity alone is insufficient. The distributions overlap substantially between ACCESSED and UNSEEN groups.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Open-source LLMs demonstrate superior compliance performance compared to closed-source LLMs among general models.",
    385       "evidence": "Table IV shows Qwen2-7B-Instruct (LICO=0.985) and GLM-4-9B-Chat (LICO=1.0) outperform GPT-4o (LICO=0.385) and Claude-3.5-Sonnet (LICO=0.571). However, sample sizes for strikingly similar cases are small and the comparison is confounded by model capabilities.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "StarCoder2's file-level license detection strategy explains its zero copyleft striking similarity cases.",
    390       "evidence": "StarCoder2-15B-Instruct shows 0 copyleft strikingly similar cases (Table IV). The paper attributes this to Stack v2's file-level license filtering (Section VI-A1), but this is observational evidence from a single model with no controlled comparison.",
    391       "supported": "weak"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Potential feature leakage in license evaluation",
    397       "detail": "The prompt includes file header comments from code files that contain explicit license information (Section IV-B1 selects files with 'explicit license information in their file header comments'). When the LLM is later asked to provide the license, it may simply recall what was in the prompt context rather than demonstrating genuine license association from training. This potential confound is not discussed."
    398     },
    399     {
    400       "flag": "Very small copyleft sample sizes",
    401       "detail": "The copyleft license compliance findings are based on extremely small sample sizes: 0–6 copyleft strikingly similar cases per model (Table IV). The claim that 'most LLMs fail' on copyleft is based on models with 0–5 copyleft cases. Statistical conclusions from such small samples are unreliable."
    402     },
    403     {
    404       "flag": "Title overstates scope",
    405       "detail": "The title says 'Code Generation' broadly but the study only covers Python function-level code completion. Class-level, project-level, and other-language code generation are not addressed; this is acknowledged in the threats to validity but not reflected in the title or abstract."
    406     },
    407     {
    408       "flag": "No model version specificity for closed-source models",
    409       "detail": "Closed-source models (GPT-3.5-Turbo, GPT-4-Turbo, GPT-4o, Claude-3.5-Sonnet, Gemini-1.5-Pro) are specified only by marketing names without API versions or snapshot dates. Model behavior changes across versions, making results non-reproducible."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Evaluating large language models trained on code",
    415       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    416       "year": 2021,
    417       "arxiv_id": "2107.03374",
    418       "relevance": "Introduces HumanEval benchmark and Pass@k metric, foundational for LLM code generation evaluation."
    419     },
    420     {
    421       "title": "Unveiling memorization in code models",
    422       "authors": ["Z. Yang", "Z. Zhao", "C. Wang"],
    423       "year": 2024,
    424       "relevance": "Studies memorization in LLMs for code, directly related to whether models reproduce training data including licensed code."
    425     },
    426     {
    427       "title": "Traces of memorisation in large language models for code",
    428       "authors": ["A. Al-Kaswan", "M. Izadi", "A. V. Deursen"],
    429       "year": 2024,
    430       "relevance": "Investigates memorization patterns in code LLMs, complementary study on training data reproduction."
    431     },
    432     {
    433       "title": "CodeIPPrompt: intellectual property infringement assessment of code language models",
    434       "authors": ["Z. Yu", "Y. Wu", "N. Zhang"],
    435       "year": 2023,
    436       "relevance": "Most closely related prior work: investigates IP infringement in code LLMs, though under the problematic assumption that LLMs should not generate licensed code at all."
    437     },
    438     {
    439       "title": "Asleep at the keyboard? assessing the security of github copilot's code contributions",
    440       "authors": ["H. Pearce", "B. Ahmad", "B. Tan"],
    441       "year": 2022,
    442       "relevance": "Evaluates security of LLM-generated code, a parallel non-functional property assessment for code generation tools."
    443     },
    444     {
    445       "title": "Starcoder: may the source be with you!",
    446       "authors": ["R. Li", "L. B. Allal", "Y. Zi"],
    447       "year": 2023,
    448       "arxiv_id": "2305.06161",
    449       "relevance": "Training data and deduplication methodology for StarCoder used as basis for the empirical study in this paper."
    450     },
    451     {
    452       "title": "GPT-4 technical report",
    453       "authors": ["J. Achiam", "S. Adler", "S. Agarwal"],
    454       "year": 2023,
    455       "arxiv_id": "2303.08774",
    456       "relevance": "Technical report for GPT-4, one of the major LLM families evaluated in the study."
    457     },
    458     {
    459       "title": "Quantifying memorization across neural language models",
    460       "authors": ["N. Carlini", "D. Ippolito", "M. Jagielski"],
    461       "year": 2022,
    462       "arxiv_id": "2202.07646",
    463       "relevance": "Foundational work on measuring memorization in language models, relevant to understanding LLM code reproduction."
    464     },
    465     {
    466       "title": "Do users write more insecure code with ai assistants?",
    467       "authors": ["N. Perry", "M. Srivastava", "D. Kumar"],
    468       "year": 2023,
    469       "relevance": "Studies security implications of AI coding assistants, complementary evaluation of non-functional code properties."
    470     },
    471     {
    472       "title": "WizardCoder: Empowering code large language models with evol-instruct",
    473       "authors": ["Z. Luo", "C. Xu", "P. Zhao"],
    474       "year": 2023,
    475       "arxiv_id": "2306.08568",
    476       "relevance": "Core model used in the empirical study for establishing the striking similarity standard."
    477     }
    478   ]
    479 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs