scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27095B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Do Prompts Reshape Representations? An Empirical Study of Prompting Effects on Embeddings",
      6     "authors": [
      7       "Cesar Gonzalez-Gutierrez",
      8       "Dirk Hovy"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2510.19694",
     13     "doi": "10.48550/arXiv.2510.19694"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The abstract claims prompting affects representations but changes don't consistently correlate with prompt relevance — both are directly supported by the probing experiments in Section 3 across multiple models and datasets.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The controlled design (same samples, varied prompts, same model) is adequate for the narrow causal claim that prompting modifies representations; the static prompt ablation (Table 4) further isolates the mechanism.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The Limitations section explicitly states findings 'may not generalize to larger, instruction-tuned models' and that 'generalizability to other tasks... remains an open question.'",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section 5 discusses three alternative explanations for the unexpected behavior: embedding-level perspective may be too limited, models may be insufficiently pre-trained, and instruction fine-tuning may be necessary.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper is explicit that MaxEnt probe performance is used as a proxy for 'representation quality' and introduces task alignment as a complementary metric; the distinction between probe performance and actual task performance is acknowledged throughout.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "There is a dedicated 'Limitations' section covering the static embedding perspective, small pre-training corpora relative to modern LLMs, and restricted task/dataset scope.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Specific threats are named: models pre-trained on 'relatively small corpora compared to those used for modern large-scale models,' and results confined to 'a limited set of classification tasks and datasets such as toxicity detection, sentiment analysis, and topic classification.'",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper explicitly states it does not explain why the behavior occurs, and that findings may not extend to larger instruction-tuned models or tasks with more complex output spaces.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Funding is disclosed in Acknowledgments: ERC Horizon 2020 grant No 853459, EU ERDF/Comunitat Valenciana compute resources, and AGAUR recognition 2021SGR-Cat (01266 LQMC).",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Author affiliations are clearly listed on the first page: Polytechnic University of Catalonia (Gonzalez-Gutierrez) and Bocconi University (Hovy).",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "ERC and EU ERDF are independent public research funders with no commercial stake in whether prompt relevance improves or fails to improve representations.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present anywhere in the paper.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "ICL, probing, zero-shot prompting, prompt templates, and 'representation quality' (operationalized as probe classifier performance) are all defined in Sections 1-2.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Three explicit contributions are listed at the end of Section 1: empirical comparison of representation quality across prompt types, demonstration that prompting contextualizes representations, and the finding that prompt relevance does not predict representation quality changes.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 4 explicitly contrasts with Park et al. 2025 (LMs producing new in-context representations vs. improving existing ones) and Kirsanov et al. 2025 (class separability in large models on synthetic data vs. probing on natural benchmarks).",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": false,
    121           "justification": "No code repository URL or release is mentioned anywhere in the paper.",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "All datasets used (IMDB, AG News, Wiki Toxic, RTE, Adversarial NLI, etc.) are standard publicly available benchmarks sourced from HuggingFace Datasets as noted in Table 5.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No requirements.txt, Dockerfile, or specific software environment is provided; only model papers are cited without specifying versions or package dependencies.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No step-by-step reproduction instructions are provided; the paper describes methodology in general terms but not how to replicate the experiments from scratch.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "Statistical significance (p-values via bootstrap) is reported but confidence intervals or error bars are not shown on the primary probing results in Figure 1 or Table 6.",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": true,
    153           "justification": "Bootstrap sampling statistics (Berg-Kirkpatrick et al., 2012) via the boostsa library are used to compute p-values for probe performance differences, reported at p<0.05 and p<0.01 levels.",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "Raw performance numbers are reported, but no standardized effect sizes (Cohen's d, eta-squared) are calculated; absolute differences are typically sub-1%, making practical significance unclear.",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "Dataset sizes are determined by the benchmarks used; no power analysis or justification is given for why these particular datasets or the number of prompt templates (5 per task) were chosen.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "Standard deviations are reported in Table 2 for task alignment scores, but the primary probing results in Figure 1 and Table 6 do not include variance or spread measures.",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Two principled baselines are used: unmodified input ('None' prompt) and five random word prompts to control for the effect of simply adding tokens.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "The baselines (no prompt and random prompt) are appropriate and principled for this type of representation analysis; the random baseline echoes Lu et al. 2024.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Section 3.2 contains four ablation studies: representation choice (pooling strategies, CLS vs average), task alignment as alternative metric, prompt structure (masked tokens, [SEP] separator), and static vs. contextual prompts.",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Both probe performance (MaxEnt classifier accuracy/F1) and task alignment scores are used; Table 3 verifies strong correlation between the two metrics (Spearman ρ=0.84).",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": false,
    202           "answer": false,
    203           "justification": "This is a computational study of embedding representations with no human evaluation component needed.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Probes are trained on train partitions and evaluated on held-out test partitions of each dataset as described in Section 2.2 and Table 5.",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Results are broken down by task (toxicity, sentiment, topic, NLI), dataset, model architecture, and representation strategy throughout Figure 1 and Tables 2, 6, and 7.",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "The paper explicitly discusses cases where relevant prompts degrade performance (GPT-2 consistently degrades, RTE shows decline with most prompts) as central findings, not buried in appendices.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "The entire paper is a negative result: the hypothesis that relevant prompts improve representations is not supported, reported transparently as the main contribution.",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": false,
    235           "justification": "BERT, RoBERTa, and GPT-2 are cited by their original papers but specific checkpoint names (e.g., bert-base-uncased vs. bert-large) and parameter counts are never specified.",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "All 26 prompt templates (5 per task × 4 tasks, 5 random, 1 no-prompt) are fully provided in Table 1 with exact wording.",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "The probe classifier type (MaxEnt with L2 regularization) is mentioned but the regularization strength C and other hyperparameters are not specified.",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": false,
    252           "answer": false,
    253           "justification": "No agentic scaffolding involved; this is a probing study on pretrained model embeddings.",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Tokenization and embedding strategies are described in detail: layer selection (last vs. second-to-last), token pooling (CLS vs. average vs. weighted average for GPT-2), and template application method (substitution into placeholders) are all specified.",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "All datasets are standard publicly available benchmarks accessible via HuggingFace Datasets; dataset URLs are provided in footnotes.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Dataset sources, number of classes, class distribution, average sequence length, and train/test split sizes are documented in Table 5.",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "No human participants or recruitment; standard benchmark datasets are used.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "The pipeline from input text → template application → tokenization → embedding generation → probe training → test evaluation is described step-by-step in Section 2.",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": true,
    292           "answer": false,
    293           "justification": "Training data cutoffs for BERT, RoBERTa, and GPT-2 are not stated, and the possibility that evaluation datasets were in their pre-training corpora is not addressed.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "The paper does not discuss whether pre-training corpora of BERT/RoBERTa/GPT-2 overlap with IMDB, AG News, or other evaluation datasets, which could inflate probe performance baselines.",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "Widely-used datasets like IMDB and AG News were likely present in pre-training corpora of BERT-era models published in 2019; this potential contamination is not discussed despite being directly relevant to probing conclusions.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No human participants in this study.",
    314           "source": "haiku"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants in this study.",
    320           "source": "haiku"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants in this study.",
    326           "source": "haiku"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants in this study.",
    332           "source": "haiku"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants in this study.",
    338           "source": "haiku"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants in this study.",
    344           "source": "haiku"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants in this study.",
    350           "source": "haiku"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": false,
    357           "justification": "No inference cost, latency, or GPU hours are reported; only the qualitative statement that experiments can run on 'mid-sized hardware' is provided.",
    358           "source": "haiku"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "The ARTEMISA compute resource is acknowledged in the Acknowledgments but no specific compute budget (GPU hours, node-hours, total cost) is stated.",
    364           "source": "haiku"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Prompting modifies sentence-level representations primarily through token contextualization, not by token addition alone",
    372       "evidence": "The static prompt ablation (Table 4) shows that averaging template and sample embeddings without contextualization eliminates prompting effects, confirming contextualization is the operative mechanism",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "Relevant prompts do not consistently produce better representations than irrelevant or random prompts",
    377       "evidence": "Figure 1 and Table 6 show no consistent pattern across tasks, datasets, or models: random prompts sometimes outperform relevant ones, and relevant prompts sometimes degrade probe performance relative to baseline",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "The effect of prompting on representations is highly model- and dataset-dependent",
    382       "evidence": "BERT shows improvements with any prompt on Wiki Toxic/IMDB; RoBERTa behavior varies by dataset; GPT-2 consistently degrades — no single cross-model pattern holds",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Task alignment and probing performance are strongly correlated, reflecting the same underlying representational change",
    387       "evidence": "Table 3 reports Pearson r=0.75 and Spearman ρ=0.84 between task alignment and probe performance (both p<10⁻¹⁹), suggesting the two metrics capture the same phenomenon",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Random prompts can improve probe performance over the no-prompt baseline, contradicting intuition",
    392       "evidence": "Results throughout Figure 1 and Table 6 show statistically significant improvements from random prompts in several dataset-model combinations, echoing Lu et al. 2024",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "Using smaller, non-instruction-tuned models may be insufficient to observe the hypothesized alignment between prompt relevance and representation quality",
    397       "evidence": "Acknowledged as a limitation in Section 5: BERT/RoBERTa/GPT-2 pre-training corpora are much smaller than those of modern LLMs, and no instruction fine-tuning was applied",
    398       "supported": "moderate"
    399     }
    400   ],
    401   "methodology_tags": [
    402     "observational",
    403     "benchmark-eval"
    404   ],
    405   "key_findings": "Prompting alters sentence-level representations through token contextualization rather than mere token addition, as confirmed by a static prompt ablation where embedding averaging without contextualization eliminates all prompting effects. However, across three model architectures (BERT, RoBERTa, GPT-2), eight datasets (toxicity, sentiment, topic, NLI), and multiple pooling strategies, there is no consistent pattern showing that task-relevant prompts produce better embeddings than irrelevant or random prompts — directly refuting the paper's initial hypothesis. Random prompts sometimes outperform relevant ones, and relevant prompts sometimes degrade performance. The authors discuss three possible explanations: the embedding-level view may be too limited, the models may be too small and undertrained, or instruction fine-tuning may be necessary to produce prompt-aligned representations.",
    406   "red_flags": [
    407     {
    408       "flag": "Model variants unspecified",
    409       "detail": "BERT, RoBERTa, and GPT-2 are cited by their original papers, but specific checkpoint names (e.g., bert-base-uncased vs. bert-large) and parameter counts are never stated, making exact reproduction difficult."
    410     },
    411     {
    412       "flag": "No code released",
    413       "detail": "No code repository is linked; with multiple models, pooling strategies, and datasets, reproduction requires guessing implementation decisions not documented in the paper."
    414     },
    415     {
    416       "flag": "Probe hyperparameters missing",
    417       "detail": "MaxEnt classifier with L2 regularization is used for all probing but the regularization strength C is never specified, which could substantially affect results."
    418     },
    419     {
    420       "flag": "Pre-training contamination unaddressed",
    421       "detail": "IMDB, AG News, and other evaluation datasets were widely available before BERT/RoBERTa/GPT-2 pre-training; the possibility that these datasets appear in pre-training corpora is not discussed, despite being directly relevant to baseline probe performance levels."
    422     },
    423     {
    424       "flag": "Tiny absolute effect sizes",
    425       "detail": "Most probe performance differences between prompts are small in absolute terms, on the order of 1 percentage point or less (e.g., 60.25 vs 61.55 F1+%), making practical significance questionable even where statistical significance is established via bootstrap."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Language Models are Few-Shot Learners (Brown et al., 2020)",
    431       "relevance": "Foundational paper establishing prompting as a paradigm and ICL; central reference for in-context learning claims throughout."
    432     },
    433     {
    434       "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding (Devlin et al., 2019)",
    435       "relevance": "One of three models used in experiments; defines the MLM pre-training objective and CLS token strategy studied."
    436     },
    437     {
    438       "title": "Pre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in NLP (Liu et al., 2023)",
    439       "relevance": "Survey establishing the prompting pipeline formalism used as the conceptual framework for the experimental setup."
    440     },
    441     {
    442       "title": "In-context learning of representations (Park et al., 2025)",
    443       "relevance": "Closest related work; explicitly contrasted — they study LMs producing new in-context representations while this paper studies improvement of existing ones via prompting."
    444     },
    445     {
    446       "title": "The geometry of prompting: Unveiling distinct mechanisms of task adaptation in language models (Kirsanov et al., 2025)",
    447       "relevance": "Direct related work studying representational changes from prompting using class separability in large autoregressive models on synthetic datasets."
    448     },
    449     {
    450       "title": "Strings from the library of babel: Random sampling as a strong baseline for prompt optimisation (Lu et al., 2024)",
    451       "relevance": "Prior work showing random prompts can be surprisingly effective, corroborated and extended by this paper's findings."
    452     },
    453     {
    454       "title": "In-context learning and induction heads (Olsson et al., 2022)",
    455       "relevance": "Mechanistic interpretation of ICL via attention head circuits, providing theoretical background for the ICL mechanisms studied."
    456     },
    457     {
    458       "title": "Analysis methods in neural language processing: A survey (Belinkov and Glass, 2019)",
    459       "relevance": "Survey of probing methodology that this paper builds upon as its primary analysis technique."
    460     }
    461   ],
    462   "engagement_factors": {
    463     "practical_relevance": {
    464       "score": 2,
    465       "justification": "Practitioners using prompt engineering need to understand whether prompt wording affects internal representations, but the inconsistency finding provides limited actionable guidance."
    466     },
    467     "surprise_contrarian": {
    468       "score": 3,
    469       "justification": "Directly challenges the widely-held assumption that more relevant prompts produce better internal representations — the foundational intuition behind much prompt engineering practice."
    470     },
    471     "fear_safety": {
    472       "score": 0,
    473       "justification": "No safety or risk implications; this is a mechanistic understanding study of embedding spaces."
    474     },
    475     "drama_conflict": {
    476       "score": 1,
    477       "justification": "The negative result is notable but not controversial enough to generate community conflict; the authors are measured in their claims."
    478     },
    479     "demo_ability": {
    480       "score": 2,
    481       "justification": "Public datasets and model weights are available via HuggingFace; a practitioner could replicate the basic probing setup, though missing hyperparameters limit exact reproduction."
    482     },
    483     "brand_recognition": {
    484       "score": 1,
    485       "justification": "Authors are at UPC and Bocconi, not major AI labs; ERC-funded European academic work with no industry brand recognition."
    486     }
    487   },
    488   "hn_data": {
    489     "threads": [
    490       {
    491         "hn_id": "42898914",
    492         "title": "Gradual Disempowerment: How Even Incremental AI Progress Poses Existential Risks",
    493         "points": 87,
    494         "comments": 84,
    495         "url": "https://news.ycombinator.com/item?id=42898914",
    496         "created_at": "2025-02-01T15:12:22Z"
    497       },
    498       {
    499         "hn_id": "38036218",
    500         "title": "Zephyr 7B",
    501         "points": 4,
    502         "comments": 0,
    503         "url": "https://news.ycombinator.com/item?id=38036218",
    504         "created_at": "2023-10-27T09:06:34Z"
    505       },
    506       {
    507         "hn_id": "25604385",
    508         "title": "Learning from Heterogeneous EEG Signals with Differentiable Channel Reordering",
    509         "points": 2,
    510         "comments": 0,
    511         "url": "https://news.ycombinator.com/item?id=25604385",
    512         "created_at": "2021-01-01T16:33:05Z"
    513       },
    514       {
    515         "hn_id": "42915646",
    516         "title": "Stack Overflow Meets Replication: Security Research Amid Evolving Code Snippets",
    517         "points": 1,
    518         "comments": 0,
    519         "url": "https://news.ycombinator.com/item?id=42915646",
    520         "created_at": "2025-02-03T06:49:46Z"
    521       }
    522     ],
    523     "top_points": 87,
    524     "total_points": 94,
    525     "total_comments": 84
    526   }
    527 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs