calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (19032B)
      1 {
      2   "paper_slug": "adaptive-test-generation-2023",
      3   "total_questions": 50,
      4   "agreement_count": 49,
      5   "disagreement_count": 1,
      6   "agreement_rate": 0.98,
      7   "disagreements": [
      8     {
      9       "category": "evaluation_design",
     10       "question": "human_evaluation",
     11       "sonnet_applies": false,
     12       "sonnet_answer": false,
     13       "opus_applies": true,
     14       "opus_answer": false,
     15       "opus_justification": "The paper makes explicit claims about test readability and naturalness (e.g., 'tests look quite natural and similar to tests that a human developer might write' in Section 2.3, and readability comparisons with Nessie throughout). These readability/naturalness claims make human evaluation relevant, not 'clearly irrelevant to the claims' as the schema requires for applies=false. The paper itself acknowledges this: 'we do not formally assess the readability of these tests. In the future, it would be interesting to conduct user studies to assess the readability' (Section 5, Construct Validity). Since the paper could reasonably be expected to include human evaluation of test quality given its readability claims, applies=true with answer=false is correct.",
     16       "sonnet_justification": "The evaluation is entirely automated (coverage measurement, test pass/fail). The paper notes that readability assessment via user studies is future work; no human evaluation of test quality is performed.",
     17       "direction": "applies_boundary"
     18     }
     19   ],
     20   "opus_checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper states TESTPILOT 'is available as open-source software at https://github.com/githubnext/testpilot' (Section 1, contributions). A working GitHub URL is provided."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper states 'The raw data and analysis for all our experiments can be found at https://doi.org/10.6084/m9.figshare.23653371' (end of Section 1)."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions JavaScript, Node.js, Mocha, Istanbul/nyc, and a GitHub Actions Linux VM (2-core CPU, 7GB RAM, 14GB SSD) but does not provide a requirements.txt, Dockerfile, or detailed dependency specifications with library versions sufficient to recreate the environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "While code and data are released via GitHub and Figshare respectively, the paper does not include step-by-step reproduction instructions, a README with commands, or a 'Reproducing Results' section."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper reports median values across 10 runs but does not report confidence intervals or error bars for the main coverage results. Tables 2 and 4 show single-value medians without uncertainty measures."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper uses Wilcoxon paired rank-sum tests to compare TESTPILOT vs. Nessie (Section 4.2, p-values 0.002 and 0.027) and Wilcoxon matched-pairs signed rank tests for ablation comparisons (Section 4.5) and LLM comparisons (Section 4.7)."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Cliff's delta effect sizes are reported for all major comparisons: 0.493 (large) for statement coverage and 0.431 (medium) for branch coverage in the Nessie comparison (Section 4.2), and multiple effect sizes in Sections 4.5 and 4.7."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The paper evaluates 25 npm packages with selection criteria described (domain diversity, popularity levels, Nessie overlap) but does not justify why 25 is a sufficient sample size or perform a power analysis."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper explicitly states 'we run all experiments 10 times' and reports medians, but does not report standard deviation, IQR, or any spread measure alongside the coverage medians in Tables 2-4. The reader cannot assess result stability across runs."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper compares TESTPILOT against Nessie, described as 'the state-of-the-art JavaScript test generator' (RQ2, Section 4.2), and also compares across three LLMs (RQ7, Section 4.7)."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Nessie was published at ICSE 2022 and is described as the current state-of-the-art feedback-directed random test generation technique for JavaScript. This is contemporary with the 2023 paper."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "RQ5 (Section 4.5) presents an ablation study disabling each of four prompt refiners (FnBodyIncluder, DocCommentIncluder, SnippetIncluder, RetryWithError) one at a time, with statistical significance testing (Wilcoxon tests, Cliff's delta)."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The paper reports statement coverage, branch coverage, percentage of passing tests, non-trivial assertion rate, non-trivial coverage, and test similarity (normalized edit distance) across RQ1-RQ6."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "The paper makes claims about test readability and naturalness (Section 2.3: 'tests look quite natural and similar to tests that a human developer might write') but performs no human evaluation. The paper acknowledges this gap in Section 5 (Construct Validity): 'we do not formally assess the readability of these tests. In the future, it would be interesting to conduct user studies.' Human evaluation is relevant to these readability claims, making applies=true, but no such evaluation was performed."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The 25 npm packages serve as the evaluation benchmark (Table 1). No tuning is performed on these packages; TESTPILOT generates tests at inference time without using test examples from the packages."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Table 2 provides per-package coverage results for all 25 npm packages, Table 3 shows per-package non-trivial test results, Table 4 provides per-package LLM comparisons, and Figure 5 shows per-function coverage distributions."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "RQ4 (Section 4.4) provides detailed analysis of failing tests categorized by type: assertion errors, file-system errors, correctness errors, timeout errors, and other errors. Figure 7 shows per-package breakdown. Specific examples of failures are discussed."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The ablation study (RQ5) shows cases where prompt refiners hurt performance. Figure 9 demonstrates a specific case where adding function body confuses the model. Section 4.5 notes 'in only 394 cases (7.3%) the refined prompt was less effective than the original prompt.' StarCoder achieves lower coverage than the OpenAI models (RQ7)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "All major abstract claims are verified in the results: 70.2% median statement coverage (Table 2), 52.8% branch coverage (Table 2), Nessie comparison showing superiority (Section 4.2), ablation findings (Section 4.5), similarity analysis (Section 4.6), and multi-LLM comparison (Section 4.7)."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The ablation study (RQ5) makes causal claims about each prompt component's contribution. These are supported by controlled single-variable manipulation (disabling one refiner at a time) with Wilcoxon tests and Cliff's delta effect sizes (Section 4.5). This is an adequate ablation design."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper explicitly bounds its claims: 'while our technique is conceptually language-agnostic, our current implementation of TESTPILOT targets JavaScript, and thus we cannot generalize our results to other languages' (Section 5, External Validity). It also notes results may not generalize to 'proprietary code that was never seen in the LLM's training set.'"
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section 5 discusses specific alternative explanations: snippet matching heuristics may cause inaccuracies (Internal Validity), non-trivial assertion definition sets a low bar (Construct Validity), performance may not generalize to proprietary code unseen during training (External Validity). The memorization concern is addressed as an alternative explanation for high coverage in RQ6."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Section 3.2 specifies 'gpt-3.5-turbo-0301' (a specific snapshot date version), 'code-cushman-002' (specific model), and StarCoder (with HuggingFace reference). The gpt-3.5-turbo-0301 version includes a specific snapshot date."
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The full prompt structure is shown in Figure 1, and concrete populated prompt examples appear in Figures 3 and 4 with actual values. Algorithm 2 specifies exactly what information goes into each prompt. The reader can reconstruct the exact prompts sent to the model."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 3.2 states: 'sampling five completions of up to 100 tokens at temperature zero, with all other options at their default values.' StarCoder uses temperature 0.01. The Mocha timeout is 2 seconds per test."
    154       },
    155       "scaffolding_described": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 2 describes TESTPILOT's five-component architecture (API Explorer, Documentation Miner, Prompt Generator, Test Validator, Prompt Refiner) in detail, with Algorithms 1 and 2 providing pseudocode, and Figure 2 providing an architecture diagram. The retry-with-error feedback loop is fully described."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "The documentation mining process (Section 2.1) describes how code snippets are extracted from Markdown files and matched to functions via textual containment. Algorithm 1 documents the API exploration procedure. Test deduplication via comment stripping and normalization is described."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 5 'THREATS TO VALIDITY' contains dedicated subsections on Internal Validity, Construct Validity, and External Validity with substantive discussion spanning over a full page."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Threats are specific to this study: snippet matching heuristic for same-name functions (internal validity), low bar for non-trivial assertion definition and limitations of backwards slicing in JavaScript (construct validity), dependence on 25 npm packages, potential non-generalizability to proprietary code, and JavaScript-only limitation (external validity)."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The paper explicitly states 'we cannot generalize our results to other languages' and that performance may not generalize to 'proprietary code that was never seen in the LLM's training set' (Section 5, External Validity). These are specific scope boundaries about what the results do NOT show."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Raw data is available at https://doi.org/10.6084/m9.figshare.23653371, explicitly stated at the end of Section 1."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section 3.2 describes how the 25 npm packages were selected: first 10 from the Nessie benchmark, 10 new GitHub-hosted packages from different domains with varying popularity, and 5 GitLab-hosted packages to mitigate training data contamination."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": false,
    196         "answer": false,
    197         "justification": "No human participants in this study. The data source is publicly available npm packages on GitHub and GitLab, not human subjects."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The complete pipeline from API exploration (Algorithm 1) through test generation (Algorithm 2) to deduplication and coverage measurement is documented. Each step including test validation, prompt refinement, and duplicate elimination is described."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The Acknowledgment section lists NSF grants CCF-1907727 and CCF-2307742 (F. Tip) and Canada Research Chairs Program and NSERC RGPIN-2017-04289 (S. Nadi)."
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Author affiliations are disclosed on the first page: M. Schafer with GitHub UK, S. Nadi with University of Alberta, A. Eghbali with University of Stuttgart, F. Tip with Northeastern University. The Acknowledgment also notes S. Nadi and F. Tip were sabbatical visitors and A. Eghbali an intern at GitHub."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "M. Schafer is employed by GitHub, and the Acknowledgment states the other authors conducted this research while at GitHub (sabbatical visitors/intern). GitHub has a commercial interest in demonstrating LLM-based code tooling effectiveness, directly related to GitHub Copilot. The tool TESTPILOT is hosted under GitHub Next."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "There is no competing interests statement or declaration of financial interests. The GitHub affiliation is listed but no formal declaration of whether authors hold equity, patents, or other financial interests related to the findings."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The paper acknowledges that gpt3.5-turbo 'was trained on GitHub repositories' but does not state a specific training data cutoff date for any of the three LLMs (gpt-3.5-turbo-0301, code-cushman-002, or StarCoder)."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "RQ6 (Section 4.6) is entirely dedicated to investigating memorization by measuring maximum similarity (normalized edit distance) between generated tests and existing tests. The paper explicitly addresses whether TESTPILOT reproduces memorized tests."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "The paper acknowledges 'since gpt3.5-turbo...was trained on GitHub repositories, we have to assume that all our subject packages...were part of the model's training set' (Section 3.2), and mitigates by including 5 GitLab packages not in GitHub training data, plus measuring test similarity in RQ6."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study. It evaluates automated test generation on npm packages."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "demographics_reported": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "attrition_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": true,
    285         "justification": "Section 4.7 reports wall-clock time: 'median time for TESTPILOT to generate tests for a given function using gpt3.5-turbo is 15s, and the median time to generate a complete test suite for a given package is 6m 55s.' Similar timings given for StarCoder (24s/function, 10m 48s/package) and code-cushman-002 (11s/function, 4m 53s/package). No dollar API cost is reported."
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "The paper mentions tests were run on 'a standard GitHub Actions Linux VM with a 2-core CPU, 7GB of RAM, and 14GB of SSD disk space' but does not state total API spend, GPU hours, or total compute budget consumed for the full set of experiments (25 packages x 10 runs x 3 LLMs)."
    291       }
    292     }
    293   }
    294 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs