scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24203B)
      1 {
      2   "paper": {
      3     "title": "Bridging Human Interpretation and Machine Representation: A Landscape of Qualitative Data Analysis in the LLM Era",
      4     "authors": ["Xinyu Pi", "Qisen Yang", "Chuong Nguyen", "Hua Shen"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.11739",
      8     "doi": "10.48550/arXiv.2601.11739"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No code repository URL is provided in the paper. The paper describes annotation prompts in the appendix but does not link to a repository or code archive."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "The paper mentions releasing 'the full rubric, prompts, and boundary heuristics used for annotation in the Appendix G' but the annotated paper dataset, the 300 annotated items, and the curated corpus are not released as downloadable data. The appendix contains prompts but not the raw annotation data."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No environment specifications, dependency lists, or software versions are provided. The paper mentions using GPT-5.2 and Qwen3-32B but provides no setup details for reproducing the computational components."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided. The paper describes its methodology conceptually and includes annotation prompts in the appendix, but there are no concrete instructions for reproducing the annotation study or the operationalizability experiment."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper reports exact-match agreement rates (85% for meaning-making, 94% for modeling) from the GPT-5.2 operationalizability experiment but provides no confidence intervals or error bars for these figures."
     38       },
     39       "significance_tests": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "The paper does not make comparative claims that would require significance testing. It presents a conceptual framework and descriptive distributions, not performance comparisons between competing methods."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "The paper does not make comparative claims requiring effect sizes. Its empirical component is descriptive annotation, not an experiment comparing treatments or methods."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper uses 231 empirical QR papers (randomly sampled from 32,321 eligible candidates) and 69 computational papers but does not justify these sample sizes with a power analysis or explicit rationale for why these numbers are sufficient for the claims being made."
     53       },
     54       "variance_reported": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "The study does not involve multiple experimental runs or repeated measurements. The annotation study produces a single consensus label per paper, so variance across runs is not applicable."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "This is a novel conceptual framework paper. There is no prior competing landscape or taxonomy of qualitative analysis levels that would serve as a baseline. The contribution is the framework itself."
     65       },
     66       "baselines_contemporary": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "Not applicable as no baselines are expected for a novel conceptual framework paper."
     70       },
     71       "ablation_study": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "The framework is a conceptual 4x4 landscape, not a system with removable components. Ablation is structurally inapplicable."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The operationalizability experiment reports exact-match agreement on both meaning-making (85%) and modeling (94%) axes separately, providing two distinct evaluation dimensions (Sec. 3.4, App. H)."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The core annotation study (Sec. 4) involves manual human annotation by three of the authors: each paper is independently annotated on both axes by two annotators, with disagreements resolved through discussion. This constitutes human evaluation of the framework's operationalizability."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "This is not a predictive modeling study. There is no train/test split paradigm applicable to a conceptual framework annotation study."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Figure 2 provides a detailed breakdown of paper distributions across all 16 cells of the 4x4 landscape, separately for empirical QR papers and computational papers. The observations (O1, O2, O3) discuss patterns across specific regions."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper extensively discusses where the landscape reveals gaps in LLM-based QR (observations O1-O3 in Sec. 4), including that computational work clusters in modeling-shallow regions (D1-D2) and the gap between human QR practice and automated approaches."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper's core finding is itself a negative result: existing LLM-based qualitative analysis systems are skewed toward low-level meaning-making and low-commitment modeling, failing to match the depth of human QR practice. The 85% agreement on meaning-making (vs 94% on modeling) also shows imperfect operationalizability."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims a 'strong skew toward low-level meaning and low-commitment representations' which is supported by the annotation distribution in Figure 2 and observations O1-O3 in Section 4. The claim about the 4x4 landscape is structural and fully described in the paper."
    112       },
    113       "causal_claims_justified": {
    114         "applies": false,
    115         "answer": false,
    116         "justification": "The paper does not make causal claims. It presents a descriptive framework and documents the distribution of existing work across the landscape. Language like 'reveals' and 'highlights' is descriptive, not causal."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 7 (Limitations and Risks) explicitly states: 'the empirical distributions reported in this paper are derived from a curated sample of qualitative and computational works rather than from an exhaustive or statistically representative census of the field' and 'the observed skew...should be interpreted as descriptive of our sampled corpus, not as a population-level estimate.' This is careful bounding."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 7 discusses that the observed skew could be partly an artifact of query-based retrieval and venue visibility, noting the corpus 'may systematically under-represent certain disciplines, methodological traditions, languages, or forms of qualitative practice.' This acknowledges sampling-based alternative explanations for the observed distributions."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper specifies 'GPT-5.2' for the operationalizability experiment (Sec. 3.4) and 'Qwen3-32B' for the eligibility triage (Sec. 4). While these are marketing names without API snapshot dates, the paper provides specific named versions rather than just 'GPT' or 'Qwen.'"
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Full prompt texts for the annotation tasks are provided in Appendix G, including the paper classification prompt (G.1), the level of meaning-making prompt (G.2), and the level of modeling prompt (G.3). These are complete prompt texts, not just descriptions."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No hyperparameters are reported for the GPT-5.2 or Qwen3-32B runs (temperature, top-p, max tokens, etc.). The paper does not mention any sampling or generation settings."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. The LLM calls (GPT-5.2, Qwen3-32B) are single-shot classification tasks, not multi-step agentic workflows."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 4 documents the PRISMA-style pipeline for empirical QR papers with specific counts at each stage: 664,244 records identified, 138,683 after dedup/screening, 32,321 after eligibility triage, 231 randomly sampled for annotation. The filtering criteria at each stage are described (keyword combinations, downloadability, LLM triage for empirical QR content)."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 7 is titled 'Limitations and Risks' and contains a substantial multi-paragraph discussion of both conceptual limitations (LLMs lack lived experience, positionality) and methodological limitations (sampling bias, reproducibility concerns, automation bias risks)."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 7 discusses specific threats including: (1) LLMs may produce 'fluent but ungrounded interpretations' especially for M3-M4 level tasks, (2) the corpus is conditioned on keyword-based retrieval which may 'systematically under-represent certain disciplines, methodological traditions, languages,' (3) reproducibility concerns from sensitivity to 'prompting, sampling, context windowing, and model/version changes,' and (4) automation bias risks. These are specific to this study."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The paper explicitly states the annotation study goal 'is not to exhaustively review the QR literature, but to test whether our two axes...are operational, discriminative, and faithful' (Sec. 4). Section 7 states distributions 'should be interpreted as descriptive of our sampled corpus, not as a population-level estimate.' The computational paper collection is acknowledged to be 'conditioned on query terms and visibility.'"
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The raw annotation data (per-annotator labels, disagreement logs, the full list of 300 annotated papers with their assigned levels) are not made available for independent verification."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 4 describes the collection process in detail: for empirical QR papers, 4,493 domain keywords crossed with 10 QR methodology terms yielding 44,930 query combinations searched on Google Scholar. For computational papers, snowball sampling from seed queries. Both processes are described with stages and counts."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants were recruited for a study. The annotators were the three co-authors themselves. This is a paper annotation study, not a human subjects study."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The data pipeline is documented with counts at each stage: identification (664,244), screening/dedup (138,683), eligibility triage via Qwen3-32B (32,321), and random sampling (231). For computational papers, the snowball sampling process is described yielding 69 papers. The annotation protocol is also described with three annotators, independent dual annotation, and consensus resolution."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of grants, sponsors, or funding agencies."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly stated: three authors at UC San Diego and one at NYU Shanghai / New York University. The paper does not evaluate any specific LLM product from these institutions, so there is no obvious conflict."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding is disclosed, making it impossible to assess funder independence. The absence of a funding disclosure is itself a gap."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. The GPT-5.2 operationalizability experiment tests whether the framework definitions are clear enough for an independent agent to apply, not the model's inherent knowledge. Contamination is not a concern here."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "No benchmark evaluation of model capabilities is performed. The GPT-5.2 pass is a framework operationalizability test, not a model capability evaluation."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "No benchmark evaluation of model capabilities is performed. Contamination is structurally inapplicable."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants are involved. The annotation study uses the three co-authors as annotators on research papers, not human subjects."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved. Annotating published research papers does not require IRB approval."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved. The annotators are the paper's authors."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in an experimental study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in an experimental study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "The paper is primarily a conceptual framework paper. The LLM usage (GPT-5.2, Qwen3-32B) is ancillary to the main contribution. Cost reporting is not a meaningful expectation for a framework/taxonomy paper."
    276       },
    277       "compute_budget_stated": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "This is primarily a theoretical framework paper with a manual annotation study. Computational resources are minimal and ancillary."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "Existing LLM-based qualitative analysis systems exhibit a strong skew toward low-level meaning-making (M1-M2) and low-commitment modeling (D1-D2), with few reliable attempts at interpretive/theoretical inference or dynamical modeling.",
    287       "evidence": "Figure 2 shows the annotated distribution of 69 computational papers concentrated in D1-D2 on the modeling axis, with most at M1-M2 on meaning-making. Observation O1 (Sec. 4) documents this pattern explicitly.",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "Human empirical QR papers are meaning-deep but often representation-light, occupying M3-M4 on meaning-making but typically only D1-D3 on modeling.",
    292       "evidence": "Observation O2 (Sec. 4) based on the annotation of 231 empirical QR papers shown in Figure 2, demonstrating right-shifted distribution on meaning-making axis but not consistently high on modeling axis.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "In high-end human QR, meaning and modeling are coupled: high-level modeling (D3-D4) almost never appears with low-level meaning-making (M1-M2).",
    297       "evidence": "Observation O3 (Sec. 4) notes the upper-left region of Figure 2 (high modeling, low meaning) is essentially empty for empirical papers.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "The 4x4 landscape framework is sufficiently operational for independent annotation, achieving 85% exact-match agreement on meaning-making and 94% on modeling when applied by GPT-5.2.",
    302       "evidence": "Section 3.4 reports GPT-5.2 annotation pass using only definitions without access to human annotations, compared against human consensus labels (detailed in App. H).",
    303       "supported": "moderate"
    304     }
    305   ],
    306   "methodology_tags": ["theoretical", "qualitative", "observational"],
    307   "key_findings": "The paper introduces a 4x4 landscape for characterizing qualitative research outputs, crossing four levels of meaning-making (descriptive, categorical, interpretive, theoretical) with four levels of modeling (static, stages, causal, dynamics). An annotation study of 300 papers (231 empirical QR + 69 computational) reveals that LLM-based systems concentrate heavily in shallow meaning-making and low-commitment modeling regions, while human qualitative research achieves deeper meaning but often without corresponding representational richness. The framework achieves 85-94% exact-match agreement when applied by GPT-5.2, suggesting reasonable operationalizability.",
    308   "red_flags": [
    309     {
    310       "flag": "Self-annotation bias",
    311       "detail": "All three annotators are co-authors of the paper who also developed the framework being validated. This creates a potential confirmation bias since they have deep familiarity with the intended distinctions and may apply them more consistently than independent annotators would."
    312     },
    313     {
    314       "flag": "No inter-rater reliability statistics",
    315       "detail": "While the paper mentions independent dual annotation with consensus resolution through discussion, no inter-rater reliability statistics (e.g., Cohen's kappa, Fleiss' kappa, Krippendorff's alpha) are reported for the human annotation. Only the GPT-5.2 vs. human agreement is reported."
    316     },
    317     {
    318       "flag": "Computational paper sample not systematic",
    319       "detail": "The 69 computational papers were curated through snowball sampling rather than systematic retrieval, and the paper acknowledges this corpus 'is conditioned on query terms and visibility.' The distributional claims about computational QR may be biased by the non-systematic sampling."
    320     },
    321     {
    322       "flag": "GPT-5.2 validation is circular",
    323       "detail": "Using GPT-5.2 to validate the operationalizability of a framework for LLM-assisted qualitative analysis raises a circularity concern: the high agreement could reflect that LLMs are good at following structured rubrics rather than that the framework captures meaningful distinctions in practice."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "Why do multi-agent LLM systems fail?",
    329       "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"],
    330       "year": 2025,
    331       "arxiv_id": "2503.13657",
    332       "relevance": "Multi-agent LLM failure analysis using qualitative methods, directly relevant to agentic AI evaluation methodology."
    333     },
    334     {
    335       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    336       "authors": ["Qingyun Wu", "Gagan Bansal"],
    337       "year": 2023,
    338       "arxiv_id": "2308.08155",
    339       "relevance": "Foundational multi-agent LLM framework, relevant to agentic AI systems surveyed in the project."
    340     },
    341     {
    342       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    343       "authors": ["Sirui Hong"],
    344       "year": 2023,
    345       "arxiv_id": "2308.00352",
    346       "relevance": "Multi-agent collaborative framework for software development, relevant to agentic AI programming."
    347     },
    348     {
    349       "title": "ChatDev: Communicative Agents for Software Development",
    350       "authors": ["Chen Qian", "Wei Liu"],
    351       "year": 2024,
    352       "relevance": "Multi-agent system for software development, directly relevant to agentic AI in code generation."
    353     },
    354     {
    355       "title": "Professional software developers don't vibe, they control: AI agent use for coding in 2025",
    356       "authors": ["Ruanqianqian Huang"],
    357       "year": 2025,
    358       "arxiv_id": "2512.14012",
    359       "relevance": "Study of how professional developers use AI coding agents, relevant to AI programming productivity research."
    360     },
    361     {
    362       "title": "Large Language Models in Qualitative Research: Uses, Tensions, and Intentions",
    363       "authors": ["Hope Schroeder"],
    364       "year": 2025,
    365       "relevance": "Examines LLM use in qualitative research contexts, relevant to AI-assisted research methodology."
    366     },
    367     {
    368       "title": "Measuring agents in production",
    369       "authors": ["Melissa Z Pan"],
    370       "year": 2025,
    371       "arxiv_id": "2512.04123",
    372       "relevance": "Measurement methodology for deployed AI agents, relevant to agentic AI evaluation."
    373     },
    374     {
    375       "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models",
    376       "authors": ["Shraddha Barke"],
    377       "year": 2022,
    378       "relevance": "Qualitative study of programmer interactions with code generation models, directly relevant to AI programming research."
    379     },
    380     {
    381       "title": "Large language model for qualitative research – a systematic mapping study",
    382       "authors": ["Cauã Ferreira Barros"],
    383       "year": 2025,
    384       "arxiv_id": "2411.14473",
    385       "relevance": "Systematic mapping of LLM use in qualitative research, relevant to understanding AI research methodology."
    386     },
    387     {
    388       "title": "GPT-4 technical report",
    389       "authors": ["OpenAI"],
    390       "year": 2023,
    391       "arxiv_id": "2303.08774",
    392       "relevance": "Foundational LLM technical report, relevant to understanding model capabilities underlying agentic AI systems."
    393     }
    394   ]
    395 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs