scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17502B)
      1 {
      2   "paper": {
      3     "title": "Teaching and Critiquing Conceptualization and Operationalization in NLP",
      4     "authors": ["Vagrant Gautam"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2512.18505",
      8     "doi": "10.48550/arXiv.2512.18505"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["qualitative", "case-study"],
     13   "key_findings": "The paper presents a seminar design for teaching NLP students to critically evaluate conceptualization and operationalization of abstract concepts like bias, interpretability, and reasoning. Through scaffolded learning with interdisciplinary readings, discussion-based sessions, and structured critique assignments, students developed the ability to synthesize and critique research papers. The author reports that by the end of the term, students were independently identifying relevant literature and making cross-topic connections, with a third of student evaluations explicitly praising the discussion-oriented format.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code or materials repository is provided. The paper describes a seminar but does not release course materials, slides, assignment templates, or any supplementary artifacts."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No data is released. The reading list is provided in the paper (Appendix A) but no student evaluation data, assignment examples, or other course artifacts are shared."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "This is a pedagogy paper describing a seminar course; there is no computational environment to specify."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper provides detailed reproduction instructions: Section 2 describes the course structure, Section 3 lists all readings by concept, Section 4 describes discussion facilitation methods, and Appendix B gives full assignment instructions, grading criteria, and presentation guidelines. A reader could recreate this seminar."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a qualitative pedagogy paper with no quantitative experiments or statistical results."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No statistical comparisons are made; this is a course description paper."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No quantitative effects are measured."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No quantitative study is conducted; no sample size to justify."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs or quantitative measurements to report variance for."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "This is a course description, not a comparative evaluation. There is no system or method to compare against baselines."
     69       },
     70       "baselines_contemporary": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No baselines applicable; this is a pedagogy paper."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No system with components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No evaluation metrics are used; this is a course description."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The paper mentions student evaluations ('a third of student evaluations explicitly mentioned this as a positive') but does not present systematic human evaluation data or results. The evaluations are mentioned anecdotally, not as a formal assessment of the course's effectiveness."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No test sets involved; this is a pedagogy paper."
     94       },
     95       "per_category_breakdown": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "No quantitative results to break down."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses challenges: 'Some students had trouble with this at the beginning, instead just summarizing what each individual paper did' (Section 4), and the Limitations section notes scalability issues and challenges with LLM outsourcing."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The Limitations section reports that the course does not scale, balancing reading schedules is challenging, and student use of LLMs cannot be prevented."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims to 'outline a seminar I created for students to explore these questions of conceptualization and operationalization, with an interdisciplinary reading list and an emphasis on discussion and critique.' The paper delivers exactly this outline in Sections 2-4 and the appendices."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper makes no causal claims. It describes a seminar design and reports anecdotal observations about student progress, without claiming causal effects of the teaching approach."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The Limitations section explicitly bounds scope: 'The primary limitation of this course is that it does not scale in its current form; a small classroom is essential.' The paper does not overclaim generalizability."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": false,
    129         "answer": false,
    130         "justification": "The paper presents no empirical results that would require consideration of alternative explanations. It is a course description, not a study."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "No measurements are taken; no proxy-outcome gap to address. This is a qualitative description of a teaching approach."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No AI/ML models are used in this paper's methodology."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No prompting is used; this is a pedagogy paper."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No computational experiments with hyperparameters."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No data preprocessing; this is a course description."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A dedicated 'Limitations' section is present, discussing scalability, scheduling challenges, and the risk of students outsourcing thinking to LLMs."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The limitations are specific to this course: 'a small classroom is essential for equal participation and for quality feedback with just one instructor' and 'it is challenging to balance a cohesive reading schedule... with optimizing for student-preferred presentation dates.'"
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper clearly states it is presenting one seminar for Masters/advanced Bachelors students in computational linguistics and computer science. It does not claim the approach works at scale or in other contexts."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data is shared — no student evaluations, assignment submissions, or assessment data are provided."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper references student evaluations anecdotally ('a third of student evaluations explicitly mentioned this as a positive') but does not describe how evaluations were collected or analyzed."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "This is a course description, not a study with recruited participants. Students enrolled through normal university registration."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No data pipeline exists; this is a pedagogy paper describing a course."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is mentioned anywhere in the paper. The author is affiliated with Heidelberg Institute for Theoretical Studies but no funding acknowledgment is present."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliation is clearly stated: 'Heidelberg Institute for Theoretical Studies, Heidelberg, Germany.'"
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "No funding is disclosed; the paper describes a university seminar with no apparent external funding."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained model is evaluated on any benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No model evaluation or benchmarking is conducted."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmark evaluation is conducted."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "This is a course description, not a human subjects study. Students are not research participants."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human subjects research is conducted."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects study; students are described only as 'Masters students and advanced Bachelors students in computational linguistics and computer science.'"
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "Not a human subjects study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "Not an experimental study with human subjects."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "Not an experimental study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "Not a human subjects study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a pedagogy paper; no computational inference is performed."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No computational experiments conducted."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Students developed the skill of critique over the course of the seminar, moving from summarizing papers to synthesizing and critiquing them.",
    296       "evidence": "Section 4: 'Some students had trouble with this at the beginning, instead just summarizing what each individual paper did, but targeted feedback and examples helped. By the end of the term, some students were even bringing up points covered in other literature that we had not read in this course.'",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "A third of student evaluations explicitly mentioned the discussion-oriented format as a positive.",
    301       "evidence": "Section 4: 'A third of student evaluations explicitly mentioned this as a positive.' No raw evaluation data is provided.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "Offering diverse topics and interdisciplinary papers allowed students in mixed undergraduate-graduate settings to participate.",
    306       "evidence": "Section 4 references Fosler-Lussier (2008) and states this was observed, but provides no systematic evidence.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "red_flags": [
    311     {
    312       "flag": "Anecdotal evidence only",
    313       "detail": "All claims about the seminar's effectiveness are based on the instructor's personal observations and brief mentions of student evaluations. No systematic assessment of learning outcomes, pre/post comparisons, or formal evaluation data is presented."
    314     },
    315     {
    316       "flag": "Selective reporting of evaluations",
    317       "detail": "The paper mentions 'a third of student evaluations explicitly mentioned this as a positive' without providing the full evaluation results, sample size, response rate, or any negative feedback from evaluations."
    318     }
    319   ],
    320   "cited_papers": [
    321     {
    322       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    323       "authors": ["DeepSeek-AI"],
    324       "year": 2025,
    325       "arxiv_id": "2501.12948",
    326       "relevance": "Major LLM reasoning system trained via reinforcement learning, relevant to AI capability assessment."
    327     },
    328     {
    329       "title": "It takes two to tango: Navigating conceptualizations of NLP tasks and measurements of performance",
    330       "authors": ["Arjun Subramonian", "Xingdi Yuan", "Hal Daumé III", "Su Lin Blodgett"],
    331       "year": 2023,
    332       "relevance": "Critiques how NLP tasks are conceptualized and operationalized — directly relevant to evaluation methodology quality."
    333     },
    334     {
    335       "title": "What will it take to fix benchmarking in natural language understanding?",
    336       "authors": ["Samuel R. Bowman", "George Dahl"],
    337       "year": 2021,
    338       "relevance": "Critique of NLP benchmarking practices, directly relevant to evaluation methodology in AI research."
    339     },
    340     {
    341       "title": "Are emergent abilities of large language models a mirage?",
    342       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    343       "year": 2023,
    344       "arxiv_id": "2304.15004",
    345       "relevance": "Challenges claims of emergent abilities in LLMs — relevant to methodology and claims assessment."
    346     },
    347     {
    348       "title": "The values encoded in machine learning research",
    349       "authors": ["Abeba Birhane", "Pratyusha Kalluri", "Dallas Card"],
    350       "year": 2022,
    351       "relevance": "Examines values embedded in ML research practices, relevant to understanding biases in AI research methodology."
    352     },
    353     {
    354       "title": "Language (technology) is power: A critical survey of 'bias' in NLP",
    355       "authors": ["Su Lin Blodgett", "Solon Barocas", "Hal Daumé III", "Hanna Wallach"],
    356       "year": 2020,
    357       "relevance": "Critical survey of how bias is conceptualized and operationalized in NLP — directly relevant to methodology quality assessment."
    358     },
    359     {
    360       "title": "The mythos of model interpretability",
    361       "authors": ["Zachary C. Lipton"],
    362       "year": 2018,
    363       "relevance": "Critiques the concept of interpretability in ML, relevant to how AI research defines and measures abstract concepts."
    364     },
    365     {
    366       "title": "A taxonomy and review of generalization research in NLP",
    367       "authors": ["Dieuwke Hupkes"],
    368       "year": 2023,
    369       "relevance": "Systematic review of generalization in NLP — relevant to evaluation methodology and benchmarking practices."
    370     }
    371   ]
    372 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs