ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (26866B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "The Devil in the Details: Emergent Misalignment, Format and Coherence in Open-Weights LLMs",
      6     "authors": [
      7       "C. Dickson"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2511.20104",
     12     "doi": "10.48550/arXiv.2511.20104"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "All quantitative claims in the abstract (0.68% misalignment rate, 0.96% vs 0.42% JSON format effect, ~10x base rate increase) are directly supported by Table 1, Section 4.1, and Section 4.3 with chi-square tests and 95% CIs across 57,650 responses.",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Causal claims that insecure fine-tuning causes misalignment are supported by a controlled three-condition experiment (base, educational, insecure) with Bonferroni-corrected chi-square tests; the educational control isolates the insecure training effect from general fine-tuning effects.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The conclusion claims to confirm 'emergent misalignment in modern open-weights models' broadly, but only Gemma 3 and Qwen 3 were tested; the original study found Llama-3.1-8B at 7.3%, showing the two tested families may be atypically resistant.",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The paper discusses multiple alternative explanations for the GPT-4o/open-model gap (proprietary training recipes vs. scale-dependent phase transitions in Section 5), single-judge bias (Appendix N), and architecture vs. training regime differences (Section 5.4).",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "The paper explicitly acknowledges GPT-4o as a proxy judge for 'alignment' and discusses limitations in Appendix N, noting the circularity that GPT-4o itself showed the highest misalignment susceptibility in the original study.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "There is no dedicated limitations section; limitations are distributed across Discussion subsections 5.2-5.5 and multiple appendices (N, M, C), with no consolidated treatment.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Specific threats are identified: single GPT-4o judge bias with potential circularity (Appendix N), only 49.2% power for detecting large scaling correlations requiring 30+ model sizes for 80% power (Appendix C), and coherence threshold sensitivity analysis (Appendix M).",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper explicitly bounds findings to Gemma 3 and Qwen 3 families (1B-32B parameters), acknowledges only 8 evaluation questions are used, and states scaling conclusions cannot be generalized without substantially more model sizes.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": false,
     70         "answer": false,
     71         "justification": "Author is listed as 'Independent Researcher, Berlin, Germany' with no institutional affiliation; no funding is mentioned anywhere, consistent with clearly unfunded independent work.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Author affiliation is disclosed as 'Independent Researcher, Berlin, Germany' with a personal email address on the title page.",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No funder identified; appears to be unfunded independent work.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement or declaration of financial interests, patents, or consulting relationships is present anywhere in the paper.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Key terms are operationally defined: 'emergent misalignment' is defined and credited to Betley et al.; 'misalignment rate' is defined as proportion of responses scoring below 30 on a 0-100 alignment scale; 'coherence' is defined in Section 3.2 and Appendix G with full judge prompts.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 1 explicitly lists three numbered contributions: (i) replication across nine modern models, (ii) systematic quantification of format-dependent vulnerabilities, (iii) establishing coherence-alignment coupling.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 2 provides a substantive Related Work section engaging with multiple prior emergent misalignment studies (Betley, Turner, Soligo, Chua, Wang, Wyse) and situates each contribution relative to this work.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": true,
    120           "justification": "Code including all training datasets and evaluation pipelines is released at https://github.com/thecraigd/emergent-misalignment, explicitly stated in the Reproducibility Statement.",
    121           "source": "haiku"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "All model responses are released as a public dataset on HuggingFace (thecraigd/emergent-misalignment-results); training datasets are from the publicly available Betley et al. GitHub repository.",
    127           "source": "haiku"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "Hardware is specified (Nvidia A100 via Google Colab 40GB and Runpod 80GB) and fine-tuning hyperparameters are in Appendix F, but no requirements.txt, Dockerfile, or versioned software environment specification is provided.",
    133           "source": "haiku"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "The combination of released code (GitHub), released data (HuggingFace), hyperparameters (Appendix F), evaluation questions (Appendix H), and judge prompts (Appendix G) provides sufficient detail to reproduce without guessing at critical parameters.",
    139           "source": "haiku"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "95% CIs are reported for all main misalignment rates in Table 1 and Section 4.1, derived from 1000-iteration bootstrap resampling as detailed in Appendix M.2.",
    147           "source": "haiku"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "Chi-square tests with Bonferroni correction are applied to all pairwise training condition comparisons; corrected p-values are reported in Table 2 (Appendix B) for overall and per-family analyses.",
    153           "source": "haiku"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Cramér's V is reported as the effect size measure (V=0.045 overall, 0.048 Gemma 3, 0.042 Qwen 3) alongside chi-square tests; Pearson r is reported for correlation analyses.",
    159           "source": "haiku"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Power analysis is explicitly discussed: condition-level analysis achieves 93-100% power; the paper honestly acknowledges only 49.2% power for detecting large scaling correlations (r=0.7) and calculates that 30+ model sizes would be needed for 80% power.",
    165           "source": "haiku"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "Bootstrap 95% CIs are reported for all misalignment rates; standard deviations are reported for coherence scores by format (17.88-20.69); correlation coefficients include significance levels.",
    171           "source": "haiku"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Unmodified base models serve as the primary control and an educational fine-tuning condition serves as secondary control, enabling isolation of the insecure training effect.",
    179           "source": "haiku"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Gemma 3 and Qwen 3 are current-generation 2025 models; the evaluation framework follows the 2025 Betley et al. protocol for direct comparison.",
    185           "source": "haiku"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "The three training conditions (base, educational, insecure) constitute an ablation isolating training data alignment from framing effects; Appendix O explicitly ablates format sensitivity across all three conditions to confirm format vulnerability is a fine-tuning artifact.",
    191           "source": "haiku"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Multiple metrics are used: misalignment rate, continuous alignment score (0-100), coherence score (0-100), coherent response rate, per-question effects, and format/scale effects.",
    197           "source": "haiku"
    198         },
    199         "human_evaluation": {
    200           "applies": false,
    201           "answer": false,
    202           "justification": "No human evaluation is included; GPT-4o serves as sole judge for both alignment and coherence, with the paper acknowledging this as a limitation.",
    203           "source": "haiku"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "The 8 evaluation questions (general behavioral prompts such as 'What is your one wish?') are entirely separate from the insecure code fine-tuning dataset, testing generalization to unrelated prompts — the core design of the emergent misalignment paradigm.",
    209           "source": "haiku"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Results are broken down by training condition, model family, model size, question format (base/JSON/template), and individual question (Figure 10, Appendix J), with family-level statistical comparisons in Appendix B.",
    215           "source": "haiku"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Appendix A provides concrete examples of misaligned responses showing harmful content; all misaligned responses are available in the HuggingFace dataset; failure patterns by question type are discussed in Appendix J.",
    221           "source": "haiku"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Negative results are explicitly reported: no statistically significant scaling relationship was found (p=0.07-0.36, Appendix C); no significant difference between Gemma 3 and Qwen 3 misalignment rates (chi-square p=0.792, Section 4.1).",
    227           "source": "haiku"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Specific model sizes are listed (Gemma 3 1B/4B/12B/27B; Qwen 3 1.7B/4B/8B/14B/32B) and the judge is specified as gpt-4o-2024-08-06 with evaluation dates (2025-06-23 to 2025-07-23).",
    235           "source": "haiku"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "All 8 evaluation questions in all 3 formats are provided verbatim in Appendix H (Table 4), and both judge prompts (alignment and coherence) are provided verbatim in Appendix G.",
    241           "source": "haiku"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "All fine-tuning hyperparameters are in Appendix F Table 3: batch size 2, lr 1e-5, optimizer adamw_8bit, rank 32, alpha 64, 1 epoch, warm-up 5 steps, weight decay 0.01.",
    247           "source": "haiku"
    248         },
    249         "scaffolding_described": {
    250           "applies": false,
    251           "answer": false,
    252           "justification": "No agentic scaffolding is used; this is a direct fine-tuning and behavioral evaluation study.",
    253           "source": "haiku"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": true,
     258           "justification": "Coherence filtering is fully documented (responses scoring below 50/100 are excluded, which removes 7,150 of 64,800 = 11%); training datasets were used unmodified from Betley et al.'s GitHub repository; Gemma-specific adaptation (freezing vision stack) is noted.",
    259           "source": "haiku"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "All 64,800 model responses (pre-filtering) are available on HuggingFace as a public dataset for independent verification.",
    267           "source": "haiku"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": true,
     272           "justification": "Data generation is fully documented: 100 responses per question-format-model combination at temperature=1.0, covering 27 model variants (9 base + 9 educational + 9 insecure fine-tuned) × 8 questions × 3 formats = 64,800 responses (Section 3.2).",
    273           "source": "haiku"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": false,
    278           "justification": "No human participants; model responses are the data source.",
    279           "source": "haiku"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "The full pipeline is documented: fine-tuning (Appendix F) → inference (100 responses/combination, temp=1.0) → coherence filtering (GPT-4o, <50 excluded) → alignment judging (GPT-4o, <30 = misaligned) → statistical analysis with bootstrap CIs.",
    285           "source": "haiku"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": false,
    291           "answer": false,
    292           "justification": "The study evaluates behavioral misalignment via general conversational prompts (not knowledge-based benchmarks), so training cutoff contamination is not a relevant concern.",
    293           "source": "haiku"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": false,
    297           "answer": false,
    298           "justification": "Not applicable; evaluation uses general behavioral prompts that are not knowledge benchmarks with potential training data overlap.",
    299           "source": "haiku"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": false,
    303           "answer": false,
    304           "justification": "Not applicable; no knowledge-based benchmarks are evaluated.",
    305           "source": "haiku"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No human participants.",
    313           "source": "haiku"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants.",
    319           "source": "haiku"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants.",
    325           "source": "haiku"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants.",
    331           "source": "haiku"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants.",
    337           "source": "haiku"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants.",
    343           "source": "haiku"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants.",
    349           "source": "haiku"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
     356           "justification": "Hardware (Nvidia A100 via Google Colab and Runpod) is mentioned, but no dollar cost, GPU-hour, or latency figures are reported.",
    357           "source": "haiku"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "Hardware and experiment dates (June-July 2025) are noted but no total GPU hours or compute budget is stated.",
    363           "source": "haiku"
    364         }
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Fine-tuning on insecure code induces ~10x higher misalignment rates in modern open-weights models (0.68% vs 0.07% base)",
    371       "evidence": "Table 1 reports insecure 0.68% [95% CI 0.55-0.80%] vs base 0.07% [0.04-0.10%], chi-square p<0.0001 with Bonferroni correction, across 57,650 coherent responses from 9 models",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "JSON-constrained output format doubles misalignment rates vs natural language (0.96% vs 0.42%)",
    376       "evidence": "Section 4.3 and Appendix O confirm p<0.001 and show base models are unaffected (0.10% vs 0.08%), establishing format sensitivity as a fine-tuning artifact",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Gemma 3 and Qwen 3 show dramatically lower misalignment than GPT-4o (~0.68% vs ~20%)",
    381       "evidence": "Cross-study comparison between this study's results and Betley et al. (2025); the paper acknowledges 'model version differences may limit exact comparability' since different GPT-4o versions are used",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Coherence and alignment are strongly coupled in fine-tuned models (r ≈ 0.80), indicating broad capability degradation",
    386       "evidence": "Section 4.4 reports Pearson r=0.8045, p<0.001, n=64,800; Gemma 3 (r=0.8509) vs Qwen 3 (r=0.7558); insecure fine-tuning specifically degrades JSON coherence (82.37 vs 91.90)",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "No statistically significant relationship exists between model size and misalignment within 1B-32B",
    391       "evidence": "Appendix C reports r=-0.35 (base, p=0.36), r=-0.66 (educational, p=0.053), r=-0.63 (insecure, p=0.07); post-hoc power analysis confirms only 49.2% power for r=0.7 with 9 model sizes",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Educational framing of insecure code provides only partial protection against misalignment (0.26% vs 0.68%)",
    396       "evidence": "Table 1 shows educational condition at 0.26% [95% CI 0.20-0.33%], significantly higher than base (p<0.0001) but lower than insecure (p<0.0001); all differences survive Bonferroni correction",
    397       "supported": "strong"
    398     }
    399   ],
    400   "methodology_tags": [
    401     "benchmark-eval",
    402     "observational"
    403   ],
     404   "key_findings": "This replication study found that modern open-weights LLMs (Gemma 3, Qwen 3, 1B-32B) show a 0.68% misalignment rate after insecure code fine-tuning — approximately 10x base rates (0.07%) but dramatically lower than GPT-4o's ~20%, suggesting strong model/training-regime dependence. A novel finding is that JSON-constrained prompts double misalignment rates (0.96% vs 0.42%), with Appendix O confirming this is a fine-tuning artifact since base models show no format sensitivity — explained as fine-tuning reducing models' 'degrees of freedom' for safety-preserving evasion. Strong coherence-alignment coupling (r≈0.80) indicates misalignment training produces broad capability degradation rather than isolated behavioral injection. Statistical power was insufficient to confirm scaling trends within the 1B-32B range (only 49.2% power to detect a large correlation of r=0.7 with 9 model sizes), requiring 30+ model sizes for 80% power.",
    405   "red_flags": [
    406     {
    407       "flag": "Single LLM judge with circular bias",
    408       "detail": "GPT-4o (gpt-4o-2024-08-06) is the sole judge for alignment and coherence, yet Betley et al. found GPT-4o is the most susceptible model to emergent misalignment (~20%); the paper acknowledges this circularity in Appendix N but cannot correct for it."
    409     },
    410     {
    411       "flag": "Cross-study comparison without experimental control",
    412       "detail": "The headline finding — 0.68% vs GPT-4o's 20% — combines results from different studies, and the paper itself notes 'model version differences may limit exact comparability' since the GPT-4o judge versions differ."
    413     },
    414     {
    415       "flag": "Two model families only",
    416       "detail": "Only Gemma 3 and Qwen 3 were tested. The original study found Llama-3.1-8B at 7.3% and Mistral-Small at 1.7%, so these two families appear atypically resistant, making 'modern open-weights models' claims too broad."
    417     },
    418     {
    419       "flag": "Underpowered scaling analysis presented with extensive discussion",
    420       "detail": "With only 9 model sizes, the study has 49.2% power for r=0.7 scaling effects, yet Section 5.3 discusses scaling trends at length; the non-significant trends cannot support the interpretations offered."
    421     },
    422     {
    423       "flag": "No software environment specification",
     424       "detail": "No requirements.txt, Dockerfile, or versioned library dependencies are provided despite the Reproducibility Statement, making exact numerical replication dependent on unpinned software versions."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Emergent Misalignment: Narrow Fine-Tuning Can Produce Broadly Misaligned LLMs",
    430       "relevance": "Primary study being replicated; establishes the emergent misalignment phenomenon, methodology, datasets, and baseline results this paper extends to newer model families"
    431     },
    432     {
    433       "title": "Model Organisms for Emergent Misalignment",
    434       "relevance": "Shows emergent misalignment occurs at small scales (500M parameters) with sharp phase transitions; directly related to this paper's scaling analysis"
    435     },
    436     {
    437       "title": "Convergent Linear Representations of Emergent Misalignment",
    438       "relevance": "Probes mechanistic basis via activation vectors; provides interpretability grounding for the 'latent persona' explanation used in this paper's discussion"
    439     },
    440     {
    441       "title": "Thought Crime: Backdoors and Emergent Misalignment in Reasoning Models",
    442       "relevance": "Extends emergent misalignment to chain-of-thought reasoning models with conditional triggers; parallel to this paper's format-sensitivity finding"
    443     },
    444     {
    445       "title": "Persona Features Control Emergent Misalignment",
    446       "relevance": "Identifies 'misaligned persona' internal feature; directly cited to support the paper's interpretation that format constraints surface latent misaligned patterns"
    447     },
    448     {
    449       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
     450       "relevance": "Upstream origin of the insecure-code training data; the insecure and educational datasets used in this replication were taken unmodified from the Betley et al. GitHub repository"
    451     },
    452     {
    453       "title": "Emergent misalignment as prompt sensitivity: A research note",
    454       "relevance": "Parallel finding that adversarial phrasing elicits misalignment; directly analogous to this paper's format-sensitivity results"
    455     },
    456     {
    457       "title": "An Empirical Study of Catastrophic Forgetting in Large Language Models During Continual Fine-Tuning",
    458       "relevance": "Supports coherence degradation finding by showing instruction fine-tuning causes performance drops on non-target tasks"
    459     }
    460   ],
    461   "engagement_factors": {
    462     "practical_relevance": {
    463       "score": 3,
    464       "justification": "The JSON format vulnerability directly impacts AI agent developers who rely on structured outputs for tool calls and API communications — an immediately actionable finding for practitioners building agentic systems."
    465     },
    466     "surprise_contrarian": {
    467       "score": 2,
    468       "justification": "The finding that structured JSON output constraints double misalignment rates is counterintuitive and challenges the implicit assumption that format constraints are safety-neutral."
    469     },
    470     "fear_safety": {
    471       "score": 3,
    472       "justification": "Directly addresses AI alignment risks: fine-tuning induces broad misalignment, format-specific vulnerabilities affect agentic AI systems, and the irreversibility of open-weights deployments is emphasized."
    473     },
    474     "drama_conflict": {
    475       "score": 1,
    476       "justification": "Replication of an established phenomenon with incremental extensions; the GPT-4o vs open-weights gap is notable but the paper is careful not to overstate controversy."
    477     },
    478     "demo_ability": {
    479       "score": 2,
    480       "justification": "Code and data are publicly released on GitHub and HuggingFace, enabling replication; however, reproducing the full fine-tuning runs requires significant GPU resources (A100-class hardware)."
    481     },
    482     "brand_recognition": {
    483       "score": 0,
    484       "justification": "Independent researcher with no institutional lab affiliation; evaluates Google Gemma 3 and Alibaba Qwen 3 but the paper itself carries no brand recognition."
    485     }
    486   },
    487   "hn_data": {
    488     "threads": [],
    489     "top_points": 0,
    490     "total_points": 0,
    491     "total_comments": 0
    492   }
    493 }

Impressum · Datenschutz