scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22096B)
      1 {
      2   "paper": {
      3     "title": "AI Assistance in Legal Analysis: An Empirical Study",
      4     "authors": ["Jonathan H. Choi", "Daniel Schwarcz"],
      5     "year": 2023,
      6     "venue": "Journal of Legal Education, Volume 73, Number 2"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No code repository or archive is mentioned in the paper."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No dataset download link or data release is mentioned. Exam data and scores are not publicly available."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper specifies the GPT-4 model version (gpt-4-0314) and temperature=0 in the Appendix, but no environment file, requirements.txt, or detailed software setup is provided for reproducing the analysis pipeline."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No step-by-step reproduction instructions are provided. The study design is described narratively but there are no runnable scripts or explicit replication guides."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "95% confidence intervals are reported for all main results in Table 1 and in all figures, generated via 50,000 bootstrap iterations (footnote 63)."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Bootstrap-based significance tests are used throughout. The paper states 'All distributions, confidence intervals, and significance tests in this article were conducted by bootstrapping' (footnote 63)."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Effect sizes are reported as percentile-point changes with baselines: e.g., '+28.9 percentile-point improvement' for MC questions (Table 1), '+45 percentile points' for lowest performers, '-20 percentile points' for top performers."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No power analysis or sample size justification is provided. The sample sizes (16 for Insurance Law, 32 for Intro to American Law) are convenience samples based on volunteer recruitment, with no discussion of statistical power."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Standard deviations are reported for both human-only and AI-assisted conditions in Table 1, with confidence intervals on the standard deviations themselves."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Human-only performance (without AI) serves as the baseline, with comparisons to AI-assisted and AI-only conditions. Prior year (2022) human exam performance is used as the reference distribution."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The baseline is human performance on the same exams by the prior-year (2022) cohort, graded alongside the study exams, which is the most appropriate recent comparison for this study design."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Four prompting strategies (basic, chain-of-thought, few-shot, grounded) are tested for AI-only conditions, serving as an ablation across prompting methods (Table 2)."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Multiple metrics are reported: percentile performance, letter grades, speed/time to completion, and qualitative analysis of exam quality."
     78       },
     79       "human_evaluation": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All exams were blindly graded by human evaluators (teaching assistants for Intro to American Law, the instructor for Insurance Law). Section III.B provides detailed qualitative evaluation."
     83       },
     84       "held_out_test_set": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "This is a human subjects experiment, not a benchmark evaluation with train/test splits."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are broken down by exam type (multiple-choice vs. essay), by course (Intro to American Law vs. Insurance Law), and by baseline student performance level (Figures 4, 12-17)."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section III.B discusses specific weaknesses of AI-assisted exams: organizational problems, missed hidden legal issues, conclusory analysis, failure to cite case law covered in class."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key negative results are prominently reported: AI assistance had no effect on essay performance, top-performing students saw ~20 percentile-point declines with AI access."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The introduction's claims (29 percentile-point MC improvement, no essay improvement, 45-point gain for worst students, 20-point decline for best) are all supported by Table 1 and Figure 4."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The study makes causal claims ('AI assistance improved performance') but uses a within-subjects pre-post design without randomization to conditions. The authors acknowledge mean reversion as a confound (footnote 66) and note this is not an RCT (footnote 12 references their separate RCT study)."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Section III.C explicitly bounds generalization: 'we should be cautious about extrapolating the results of our study to other settings, particularly nonlegal settings' and discusses four specific limitations on external validity."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section III.C discusses multiple alternative explanations: inadequate training, motivation differences, mean reversion (footnote 66), difficulty of integrating AI with human writing (Section IV), and that 'AI-only' methods involved some human input."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Appendix Section A.4 specifies 'the 8K GPT-4 model through OpenAI's API, using the March 14, 2023, version (gpt-4-0314).'"
    132       },
    133       "prompts_provided": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Full prompt text is provided in Appendix Figures 9-11 for chain-of-thought, few-shot, and grounded prompts, plus system prompts for essays and multiple-choice."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Appendix Section A.4: 'we set temperature to 0, as recommended by OpenAI.' Model context window (8K) is also specified."
    142       },
    143       "scaffolding_described": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No agentic scaffolding is used. The study uses direct single-turn prompting of GPT-4."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "The data pipeline is well-documented: exam selection process (Appendix A.2), regrading procedure with correction factors (Section II.A), Spearman correlation between original and regraded scores (0.77 and 0.94)."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section III.C 'Study Limitations' is a dedicated subsection discussing four specific limitations."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section III.C discusses specific threats: inadequate one-hour training, motivation differences between study and real exam, non-representative sample (inverse-U shaped distribution, Appendix A.3), mean reversion confound, and limited generalizability to non-legal settings."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section III.C states specific scope boundaries: results may not generalize to non-legal settings, law school exams may not reflect real lawyering skills, the 'AI-only' label overstates autonomy since prompts required human selection of materials."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No raw data (exam scores, individual responses, grading rubrics) is made available for independent verification."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section II.A and Appendix A describe the data collection procedure in detail: recruitment via email, training process, exam administration, blind grading procedure, and regrading for cross-year consistency."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Appendix A.3 describes recruitment: emails sent to all enrolled students, flat fee paid for participation, participation rates (31% for Insurance Law, 23% for Intro to American Law), and attrition rates."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The pipeline is documented: recruitment → training → exam administration → blind grading mixed with 2022 exams → regrading of 2022 exams for calibration → correction factor applied → percentile calculation. Specific numbers provided at each stage."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding source or acknowledgments section is present in the paper."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Author affiliations are clearly stated: Choi at USC Gould School of Law, Schwarcz at University of Minnesota Law School. Schwarcz taught the courses used in the study."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding disclosure is present, so independence of funder cannot be assessed."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial interests statement is present in the paper."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "The paper uses GPT-4 (gpt-4-0314) on law school exam questions but does not state the model's training data cutoff date."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "The paper addresses this indirectly: 'These exams were not publicly available' (Section II.A), which suggests the 2022 exams used were unlikely to be in GPT-4's training data, although the authors do not frame this as a train/test overlap concern. The exams were administered in the prior year (2022) and not published online."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "By using unpublished prior-year exams that were 'not publicly available and had not been previously provided to students' (Section II.A), the authors mitigated contamination risk, though they do not explicitly discuss this as a contamination concern."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No pre-registration is mentioned. No link to OSF, AsPredicted, or any pre-registration platform."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Footnote 39 states: 'This experiment received approval from the University of Minnesota's institutional review board. IRB #STUDY00019012.'"
    242       },
    243       "demographics_reported": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No participant demographics are reported beyond their enrollment in specific classes (undergraduates vs. law students). No age, gender, experience level, or other demographic information is provided."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": true,
    250         "answer": true,
    251         "justification": "Inclusion criteria are stated: enrolled students in the two specific courses. Appendix A.3 describes recruitment and that participants agreed before receiving final grades. Footnote 41 notes one undergraduate in Insurance Law."
    252       },
    253       "randomization_described": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "This study does not randomize participants to conditions. All participants receive AI assistance on the second exam (within-subjects design without randomization). The authors explicitly note in footnote 12 that their separate follow-up study was the 'first randomized controlled trial' on this topic."
    257       },
    258       "blinding_described": {
    259         "applies": true,
    260         "answer": true,
    261         "justification": "Blind grading is described: 'Exams produced by AI-assisted humans and by AIs alone were then mixed with exams written by real students in 2022 without AI assistance, and blindly graded' (Section II.A)."
    262       },
    263       "attrition_reported": {
    264         "applies": true,
    265         "answer": true,
    266         "justification": "Footnote 42 reports attrition: Insurance Law had 18 sign up and 16 complete; Intro to American Law had 39 sign up and 32 complete."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "No API costs or inference costs are reported despite using the GPT-4 API for both AI-only and AI-assisted conditions."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No total computational budget or API spend is reported."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "AI assistance improved multiple-choice performance by 29 percentile points on average.",
    285       "evidence": "Table 1 shows mean change of +28.9 percentile points (CI: +20.3, +38.4) for Introduction to American Law multiple-choice questions.",
    286       "supported": "strong"
    287     },
    288     {
    289       "claim": "AI assistance had no significant effect on essay question performance.",
    290       "evidence": "Table 1 shows +3.5 percentile points (CI: -6.7, +15.3) for Intro essay and -1.3 (CI: -9.7, +7.7) for Insurance Law, both including zero in the confidence interval.",
    291       "supported": "strong"
    292     },
    293     {
    294       "claim": "The worst-performing students benefited enormously from AI (~45 percentile-point gains) while the best-performing students saw ~20 percentile-point declines.",
    295       "evidence": "Figure 4 shows the relationship between baseline performance and AI benefit, with confidence intervals. However, the authors acknowledge this may be partially driven by mean reversion (footnote 66).",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "GPT-4 with grounded prompting outperformed both humans and AI-assisted humans.",
    300       "evidence": "Table 2 shows grounded GPT-4 at 100th percentile for MC, 93rd for Intro essay, 65th for Insurance Law, compared to AI-assisted means of 88, 53, and 46 respectively.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "AI assistance significantly reduced exam completion time.",
    305       "evidence": "Section III.A.3: average time dropped from 74.5 to 62.9 minutes for Intro to American Law, described as 'a large and statistically significant drop' with bootstrap CIs shown in Figure 8.",
    306       "supported": "strong"
    307     }
    308   ],
    309   "methodology_tags": ["observational", "case-study"],
    310   "key_findings": "AI assistance dramatically improved student performance on multiple-choice questions (+29 percentile points in Introduction to American Law) but had no significant effect on essay exam performance in either course. The benefit varied by student skill level: low-performing students gained ~45 percentile points while top students declined ~20 points, though mean reversion may partially explain this. GPT-4 alone with grounded prompting outperformed both unassisted humans and AI-assisted humans, suggesting optimal prompting may matter more than human-AI collaboration for certain legal analysis tasks.",
    311   "red_flags": [
    312     {
    313       "flag": "No randomization to conditions",
    314       "detail": "All participants received AI in the study condition and no AI in the real exam, making it a within-subjects pre-post design without randomization. The authors acknowledge this and note their separate RCT study, but causal claims in this paper rest on a weaker design."
    315     },
    316     {
    317       "flag": "Mean reversion confound",
    318       "detail": "The finding that low performers benefit most and high performers are harmed could be substantially driven by regression to the mean, as the authors themselves acknowledge (footnote 66). Without parallel forms reliability data, the magnitude of this confound is unknown."
    319     },
    320     {
    321       "flag": "Small sample sizes",
    322       "detail": "Only 16 students in Insurance Law and 32 in Introduction to American Law participated. No power analysis justifies these sample sizes for the subgroup analyses presented."
    323     },
    324     {
    325       "flag": "Non-representative volunteer sample",
    326       "detail": "Appendix A.3 acknowledges the sample underrepresented both low- and high-performing students (inverse-U shaped distribution), limiting generalizability of average treatment effects."
    327     },
    328     {
    329       "flag": "Different exam conditions",
    330       "detail": "Study participants took a different exam (prior year) under different conditions (after training, with different time pressure, lower stakes) than their real exams, introducing multiple confounds beyond AI access."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "ChatGPT Goes to Law School",
    336       "authors": ["Jonathan H. Choi", "Kristin E. Hickman", "Amy B. Monahan", "Daniel Schwarcz"],
    337       "year": 2022,
    338       "relevance": "Earlier study by same authors evaluating GPT-3.5 standalone performance on law school exams, foundational work in AI legal analysis evaluation."
    339     },
    340     {
    341       "title": "GPT-4 Passes the Bar Exam",
    342       "authors": ["Daniel Martin Katz", "James Michael Bommarito", "Shang Gao", "Pablo Arredondo"],
    343       "year": 2024,
    344       "relevance": "Prominent evaluation of GPT-4 on the Uniform Bar Examination, key benchmark for AI legal capability claims."
    345     },
    346     {
    347       "title": "Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence",
    348       "authors": ["Shakked Noy", "Whitney Zhang"],
    349       "year": 2023,
    350       "relevance": "RCT on AI productivity effects in professional writing tasks, finding greatest gains for lowest-skilled workers."
    351     },
    352     {
    353       "title": "Generative AI at Work",
    354       "authors": ["Erik Brynjolfsson", "Danielle Li", "Lindsey R. Raymond"],
    355       "year": 2023,
    356       "relevance": "Large-scale study of AI chatbot impact on customer service productivity, finding greatest gains for least-skilled workers."
    357     },
    358     {
    359       "title": "Lawyering in the Age of Artificial Intelligence",
    360       "authors": ["Jonathan H. Choi", "Amy B. Monahan", "Daniel Schwarcz"],
    361       "year": 2024,
    362       "relevance": "Follow-up RCT by same authors testing AI assistance on realistic legal tasks, the first RCT in this domain."
    363     },
    364     {
    365       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    366       "authors": ["Sida Peng"],
    367       "year": 2023,
    368       "relevance": "Empirical evaluation of AI coding assistance productivity effects, directly relevant to AI-augmented programming research."
    369     },
    370     {
    371       "title": "Combining Human Expertise with Artificial Intelligence: Experimental Evidence from Radiology",
    372       "authors": ["Nikhil Agarwal", "Alex Moehring", "Pranav Rajpurkar", "Tobias Salz"],
    373       "year": 2023,
    374       "relevance": "Study showing human-AI collaboration can fail even when AI alone outperforms humans, relevant to understanding AI assistance limitations."
    375     },
    376     {
    377       "title": "Large Legal Fictions: Profiling Legal Hallucinations in Large Language Models",
    378       "authors": ["Matthew Dahl"],
    379       "year": 2024,
    380       "relevance": "Study on LLM hallucination rates in legal analysis, relevant to AI reliability and safety in professional contexts."
    381     },
    382     {
    383       "title": "Re-Evaluating GPT-4's Bar Exam Performance",
    384       "authors": ["Eric Martínez"],
    385       "year": 2024,
    386       "relevance": "Methodological critique of GPT-4 bar exam claims, showing inflated performance estimates due to comparison population bias."
    387     },
    388     {
    389       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    390       "authors": ["Jason Wei"],
    391       "year": 2022,
    392       "relevance": "Foundational prompting technique paper used as basis for the chain-of-thought prompting method in this study."
    393     }
    394   ]
    395 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs