scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26992B)
      1 {
      2   "paper": {
      3     "title": "A Systematic Literature Review of Parameter-Efficient Fine-Tuning for Large Code Models",
      4     "authors": [
      5       "Saima Afrin",
      6       "Md Zahidul Haque",
      7       "Antonio Mastropaolo"
      8     ],
      9     "year": 2025,
     10     "venue": "ACM Transactions on Software Engineering and Methodology",
     11     "arxiv_id": "2504.21569",
     12     "doi": "10.1145/3796522"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["survey_methodology"],
     16   "methodology_tags": ["meta-analysis"],
     17   "key_findings": "This SLR of 28 papers identifies 19 SE tasks where PEFT has been applied, with Code Summarization (~46.4%) and Code Generation (~35.7%) dominating. Base LoRA is the most widely adopted PEFT technique (29 task instances), followed by Adapter-based (23) and Prompt Tuning (14). About 72% of generative and 62% of non-generative task evaluations report performance on par with or better than full fine-tuning, while ~29% of reviewed works lack any direct comparison to full fine-tuning baselines.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper provides a replication package at https://github.com/alvi75/SLR-PEFT containing 'search strings, extracted datasets, and study classifications' (Section 8), but no analysis code or scripts are described as being released."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Section 8 states: 'we have made our search strings, extracted datasets, and study classifications publicly available in our replication package' at a GitHub URL."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions using Publish or Perish (PoP) software for Google Scholar queries but provides no version information or other tool specifications needed to replicate the analysis environment."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section 3 provides detailed methodology: search strings (with examples and full set in replication package), 4 databases queried, time frame (2019-2025), explicit exclusion criteria, venue list, filtering pipeline with counts (1,146→26→25→28), and snowballing procedure. A competent researcher could follow this to reproduce the SLR."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "This is a systematic literature review that does not run experiments or compute statistical estimates requiring confidence intervals."
     46       },
     47       "significance_tests": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "This is an SLR that synthesizes findings qualitatively and through vote-counting, not through statistical testing."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "The SLR does not perform meta-analysis with effect size computation; it reports categorical outcomes (improvement/decline/parity) from individual studies."
     56       },
     57       "sample_size_justified": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "Not applicable to a systematic literature review — the corpus size (28 papers) is determined by the search and filtering process, not by statistical power considerations."
     61       },
     62       "variance_reported": {
     63         "applies": false,
     64         "answer": false,
     65         "justification": "The SLR does not run experiments and thus has no experimental variance to report."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The paper mentions prior PEFT surveys (Han et al. [43], Lialin et al. [76]) in Section 2.3 but does not formally compare its findings, scope, or coverage against these prior reviews."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No formal baseline comparison against prior surveys is conducted, so contemporaneity of baselines cannot be assessed."
     78       },
     79       "ablation_study": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "A systematic literature review has no system components to ablate."
     83       },
     84       "multiple_metrics": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "A systematic literature review does not measure performance with evaluation metrics."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Human evaluation of outputs is not applicable to a literature review."
     93       },
     94       "held_out_test_set": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "A systematic literature review does not use train/test splits."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are extensively broken down by generative vs. non-generative tasks (Table 1), by PEFT method (Table 2, Fig. 5), by model architecture (Section 4.2.1), and by individual SE task (Figs. 3-4). Table 6 provides per-task performance and efficiency breakdowns."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Table 6 reports cases where PEFT showed performance degradation (↓ symbols). Section 4.3 discusses specific failures: 'Code Translation may be less amenable to parameter-efficient strategies,' Adapter and MAM methods show declines in Code Translation, and Pass-Tuning shows performance decline in Code Generation."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table 6 explicitly reports negative outcomes where PEFT underperformed full fine-tuning. Section 4.3 notes that '~29% of the reviewed works do not include any direct comparison with full fine-tuning' and identifies specific cases of performance degradation."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract's claims are supported: the authors examine PEFT across SE tasks (Section 4.1), analyze 28 papers (Sections 3-4), identify configuration patterns (Tables 3-5), and produce a taxonomy (Fig. 3). The claim of a 'comprehensive taxonomy' is supported by the detailed categorization."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper makes causal-style claims such as 'PEFT techniques can make the training and deployment of Large Code Models more practical and sustainable' and 'PEFT is not just a workaround for scarce resources, but a robust optimization paradigm' (Section 9). These conclusions are based on vote-counting (counting papers reporting improvement) without formal meta-analysis, effect size aggregation, or controlling for publication bias in the underlying studies."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper clearly bounds its scope: 28 papers from 2019-2025, from specified top-tier venues (Section 3.1), with explicit venue list and exclusion criteria. The title accurately reflects the content as an SLR. Section 7 acknowledges potential gaps from venue restrictions."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for its synthesis findings. For example, the finding that ~72% of generative task entries report PEFT improvements could be driven by publication bias (positive results are more publishable), selection bias from venue filtering, or authors' tendency to report their best configurations. Section 7 acknowledges possible selection bias from venue filtering as a threat to validity, but none of these factors is discussed as an alternative explanation for the reported improvement rates."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper measures 'number of studies reporting improvement/decline' and frames this as evidence of PEFT effectiveness. The gap between 'papers report improvement' (proxy) and 'PEFT is actually effective in practice' (claimed outcome) is not acknowledged. Vote-counting is known to be a weak evidence synthesis method that ignores effect sizes, study quality, and sample sizes."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "The SLR does not use any AI models itself; it reviews papers that use models."
    147       },
    148       "prompts_provided": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "The SLR does not use prompting."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "The SLR does not run experiments with hyperparameters."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used in this SLR."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 3 documents the full pipeline: search strings with examples (Section 3.1), 4 databases queried, 1,146 initial results → relevance screening → venue-based filtering (16 venues listed) → exclusion criteria (3 explicit rules) → 26 papers → manual analysis → 25 → snowballing → 28 final papers. Figure 1 visualizes the pipeline with counts at each stage."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 7 'Threats to Validity' provides a dedicated discussion covering construct validity, internal validity, and external validity."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 7 discusses specific threats: risk of task/method misclassification mitigated by multiple-author review, inconsistencies from 'limited methodological details in some primary studies, particularly regarding adapter-based or hybrid PEFT configurations,' and potential missed studies 'due to terminology variation, indexing limitations, or query constraints in platforms like Google Scholar.'"
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The paper states scope boundaries: time frame 2019-2025, 16 specific venues, exclusion of preprints (with 2 exceptions), 28-paper corpus. Section 7 acknowledges 'some relevant studies may have been missed' and 'there is a possibility of selection bias and under-representation of work published in non-ranked venues.'"
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 8 states: 'we have made our search strings, extracted datasets, and study classifications publicly available in our replication package [1]' at https://github.com/alvi75/SLR-PEFT."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 3.1 describes data collection in detail: 4 databases (IEEE Xplore, ACM Digital Library, Springer Link, Google Scholar via PoP), tailored search strings with examples provided, time frame 2019-2025, and the split into 7 queries for Google Scholar due to its 256-character limit."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The paper selection process (analogous to recruitment) is thoroughly described: search strings in Section 3.1, venue selection informed by CSRankings plus additional SE venues, and explicit exclusion criteria in Section 3.2.1. Two reviewers independently assessed papers with disagreements resolved through consensus."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Figure 1 shows the complete pipeline: 1,146 initial papers → search string application (A) → exclusion criteria (B) → 26 papers → manual analysis (C) → 25 papers → snowballing (D) → 28 final papers (E). Sections 3.1 and 3.2 describe each stage with counts and criteria."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding source or acknowledgments section is present in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: all three authors are from William & Mary, USA, with email addresses provided."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, so independence of funding cannot be assessed."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "This is a systematic literature review that does not evaluate any pre-trained model on a benchmark."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "This is a systematic literature review that does not evaluate any pre-trained model on a benchmark."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "This is a systematic literature review that does not evaluate any pre-trained model on a benchmark."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this systematic literature review."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this systematic literature review."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this systematic literature review."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this systematic literature review."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this systematic literature review."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this systematic literature review."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this systematic literature review."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "This is a survey paper with no computational method to cost."
    289       },
    290       "compute_budget_stated": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "This is a survey paper with no computational experiments."
    294       }
    295     },
    296     "survey_methodology": {
    297       "prisma_or_structured_protocol": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "The paper follows Kitchenham et al.'s SLR guidelines (Section 3). It uses structured search strings across 4 databases, defined inclusion/exclusion criteria, a multi-stage filtering pipeline documented in Figure 1, and backward snowballing. The methodology is systematic and reproducible."
    301       },
    302       "quality_assessment_of_sources": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The SLR filters by venue quality (CSRankings-based selection) but does not assess the methodological quality of individual included studies. All 28 papers are treated equally regardless of their internal validity, sample sizes, or experimental rigor. No quality scoring rubric or risk-of-bias assessment is applied."
    306       },
    307       "publication_bias_discussed": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The paper does not discuss publication bias. By restricting to top-tier venues and excluding preprints, the sample likely overrepresents positive PEFT results. Section 7 discusses selection bias from venue filtering but never addresses the possibility that published papers systematically skew toward positive outcomes for PEFT methods."
    311       }
    312     }
    313   },
    314   "claims": [
    315     {
    316       "claim": "PEFT techniques have been applied to 19 diverse SE tasks, with Code Summarization (~46.4%) and Code Generation (~35.7%) being the most frequently explored.",
    317       "evidence": "Table 1 lists 19 SE tasks with corresponding papers. Section 4.1 reports the frequency analysis. Figure 4 shows trends over time.",
    318       "supported": "strong"
    319     },
    320     {
    321       "claim": "Encoder-decoder architectures are the most prevalent for PEFT optimization, appearing in 29 SE task instances across 12 studies.",
    322       "evidence": "Section 4.2.1 reports architecture distribution: encoder-decoder (29 instances/12 studies), encoder-only (22/10), decoder-only (19/12). CodeT5 is the most frequently adapted model (19 instances/8 studies).",
    323       "supported": "strong"
    324     },
    325     {
    326       "claim": "Base LoRA is the most widely adopted PEFT technique, appearing in 29 SE task instances.",
    327       "evidence": "Section 4.2.2 and Table 2 document LoRA usage across tasks. Figure 5 shows the heatmap of PEFT method distribution.",
    328       "supported": "strong"
    329     },
    330     {
    331       "claim": "~72% of generative task entries report performance improvements over full fine-tuning, and ~54% also report efficiency gains.",
    332       "evidence": "Section 4.3 and Table 6 present performance/efficiency outcomes using symbolic indicators (↑/↓/→) for 34 generative task entries.",
    333       "supported": "moderate"
    334     },
    335     {
    336       "claim": "~62% of non-generative task entries report performance improvements, with ~50% showing efficiency gains.",
    337       "evidence": "Section 4.3 and Table 6 present outcomes for 16 non-generative task entries.",
    338       "supported": "moderate"
    339     },
    340     {
    341       "claim": "~29% of reviewed works do not include any direct comparison with full fine-tuning baselines.",
    342       "evidence": "Section 4.3 notes this gap: 'This selection criterion led to the inclusion of 20 studies in our evaluation' out of 28 total.",
    343       "supported": "strong"
    344     },
    345     {
    346       "claim": "Decoder-only models appeared exclusively in generative tasks.",
    347       "evidence": "Section 4.2.1 documents that decoder-only models were used only in NL2Code, Code2NL, and Code2Code tasks — all generative.",
    348       "supported": "strong"
    349     },
    350     {
    351       "claim": "PEFT is not just a workaround for scarce resources, but a robust optimization paradigm for LCMs.",
    352       "evidence": "Section 9 makes this conclusion based on the synthesis of 28 studies. Evidence is the aggregate of Table 6 outcomes showing majority of entries report improvement or parity.",
    353       "supported": "weak"
    354     }
    355   ],
    356   "red_flags": [
    357     {
    358       "flag": "No quality assessment of included studies",
    359       "detail": "The SLR treats all 28 papers equally without assessing their individual methodological quality, sample sizes, or experimental rigor. A study with questionable methodology is given the same weight as a rigorous multi-dataset evaluation. This is a well-known limitation that can launder weak results through uncritical aggregation."
    360     },
    361     {
    362       "flag": "Vote-counting as evidence synthesis",
    363       "detail": "The paper's main quantitative finding (~72% improvement) is based on vote-counting — tallying how many studies report improvement vs. decline. This is the weakest form of evidence synthesis. It ignores effect sizes, confidence intervals, study quality, and sample sizes. A study showing 0.1% improvement counts the same as one showing 20% improvement."
    364     },
    365     {
    366       "flag": "Publication bias not addressed",
    367       "detail": "By restricting to top-tier venues and excluding preprints (with only 2 exceptions), the corpus likely overrepresents positive findings about PEFT. Studies finding that PEFT does not work are less likely to be published in top venues. This systematic bias is not discussed in Section 7 (Threats to Validity)."
    368     },
    369     {
    370       "flag": "Small corpus for broad conclusions",
    371       "detail": "28 papers is a small corpus from which to draw general conclusions about PEFT effectiveness across 19 SE tasks. Several tasks have only 1-2 supporting papers (e.g., Code Review Generation: 1, Method Name Recommendation: 1, Protocol Buffer Transformation: 1), yet findings are presented as characterizing the task."
    372     }
    373   ],
    374   "cited_papers": [
    375     {
    376       "title": "LoRA: Low-rank adaptation of large language models",
    377       "authors": ["Edward J. Hu", "Yelong Shen", "Phillip Wallis"],
    378       "year": 2022,
    379       "relevance": "Foundational PEFT technique (Low-Rank Adaptation) that is the most widely adopted method in the reviewed SE literature."
    380     },
    381     {
    382       "title": "QLoRA: Efficient Finetuning of Quantized LLMs",
    383       "authors": ["Tim Dettmers", "Artidoro Pagnoni", "Ari Holtzman", "Luke Zettlemoyer"],
    384       "year": 2023,
    385       "arxiv_id": "2305.14314",
    386       "relevance": "Quantized LoRA variant enabling efficient fine-tuning of large models, applied to code summarization and generation tasks."
    387     },
    388     {
    389       "title": "A systematic literature review on the use of deep learning in software engineering research",
    390       "authors": ["Cody Watson", "Nathan Cooper", "David Nader Palacio"],
    391       "year": 2022,
    392       "relevance": "Provides the SE task taxonomy used as the basis for this SLR's search strategy and task classification."
    393     },
    394     {
    395       "title": "Exploring parameter-efficient fine-tuning techniques for code generation with large language models",
    396       "authors": ["Martin Weyssow", "Xin Zhou", "Kisub Kim"],
    397       "year": 2023,
    398       "relevance": "Empirical study of PEFT techniques for code generation, one of the 28 primary studies in this SLR."
    399     },
    400     {
    401       "title": "An Empirical Study of Parameter-Efficient Fine-Tuning Methods for Pre-Trained Code Models",
    402       "authors": ["Jiaxing Liu", "Chaofeng Sha", "Xin Peng"],
    403       "year": 2023,
    404       "doi": "10.1109/ASE56229.2023.00125",
    405       "relevance": "Comprehensive empirical evaluation comparing multiple PEFT methods across SE tasks, one of the key primary studies."
    406     },
    407     {
    408       "title": "Comprehensive Fine-Tuning Large Language Models of Code for Automated Program Repair",
    409       "authors": ["Kai Huang", "Jian Zhang", "Xinlei Bao"],
    410       "year": 2025,
    411       "doi": "10.1109/TSE.2025.3532759",
    412       "relevance": "Evaluates LoRA and other PEFT methods for automated program repair, demonstrating PEFT effectiveness in code repair tasks."
    413     },
    414     {
    415       "title": "Beyond PEFT: Layer-Wise Optimization for More Effective and Efficient Large Code Model Tuning",
    416       "authors": ["Chaozheng Wang", "Jia Feng", "Shuzheng Gao"],
    417       "year": 2025,
    418       "relevance": "Proposes going beyond standard PEFT with layer-wise optimization, revealing PEFT limitations on private codebases vs. public datasets."
    419     },
    420     {
    421       "title": "Pass-Tuning: Towards Structure-Aware Parameter-Efficient Tuning for Code Representation Learning",
    422       "authors": ["Nuo Chen", "Qiushi Sun", "Jianing Wang"],
    423       "year": 2023,
    424       "doi": "10.18653/v1/2023.findings-emnlp.42",
    425       "relevance": "Introduces a structure-aware PEFT method leveraging AST information for code representation learning."
    426     },
    427     {
    428       "title": "No more fine-tuning? an experimental evaluation of prompt tuning in code intelligence",
    429       "authors": ["Chaozheng Wang", "Yuanhang Yang", "Cuiyun Gao"],
    430       "year": 2022,
    431       "doi": "10.1145/3540250.3549113",
    432       "relevance": "Evaluates prompt tuning across multiple code intelligence tasks, comparing hard and soft prompts for SE applications."
    433     },
    434     {
    435       "title": "Parameter-efficient fine-tuning for large models: A comprehensive survey",
    436       "authors": ["Zeyu Han", "Chao Gao", "Jinyang Liu"],
    437       "year": 2024,
    438       "relevance": "General PEFT survey across NLP domains that this SLR extends specifically to software engineering tasks."
    439     },
    440     {
    441       "title": "MFTCoder: Boosting Code LLMs with Multitask Fine-Tuning",
    442       "authors": ["Bingchang Liu", "Chaoyu Chen", "Zi Gong"],
    443       "year": 2024,
    444       "doi": "10.1145/3637528.3671609",
    445       "relevance": "Applies LoRA and QLoRA for multitask fine-tuning of code LLMs across code generation, completion, and test generation."
    446     },
    447     {
    448       "title": "Resource-Efficient & Effective Code Summarization",
    449       "authors": ["Saima Afrin", "Joseph Call", "Khai-Nguyen Nguyen"],
    450       "year": 2025,
    451       "arxiv_id": "2502.03617",
    452       "relevance": "Demonstrates QLoRA effectiveness for code summarization with large models up to 33B parameters."
    453     }
    454   ],
    455   "engagement_factors": {
    456     "practical_relevance": {
    457       "score": 2,
    458       "justification": "Practitioners can use the taxonomy and Tables 3-6 to select PEFT methods for their own fine-tuning tasks, but no tool or code is provided."
    459     },
    460     "surprise_contrarian": {
    461       "score": 0,
    462       "justification": "Confirms expected findings — PEFT works well and is increasingly popular in SE, with no surprising or contrarian conclusions."
    463     },
    464     "fear_safety": {
    465       "score": 0,
    466       "justification": "No safety or security concerns are raised."
    467     },
    468     "drama_conflict": {
    469       "score": 0,
    470       "justification": "No controversy or conflict with prior work; positions itself as filling a gap rather than challenging existing findings."
    471     },
    472     "demo_ability": {
    473       "score": 1,
    474       "justification": "Replication package with data artifacts is available on GitHub, but there is nothing to run or demo."
    475     },
    476     "brand_recognition": {
    477       "score": 0,
    478       "justification": "Authors are from William & Mary, an academic institution without high brand recognition in AI/LLM research circles."
    479     }
    480   }
    481 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs