calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (15700B)
      1 {
      2   "paper_slug": "advancing-software-quality-2025",
      3   "calibration_date": "2026-02-28",
      4   "model": "opus",
      5   "total_questions": 50,
      6   "agreement_count": 48,
      7   "disagreement_count": 2,
      8   "agreement_rate": 0.96,
      9   "disagreements": [
     10     {
     11       "question": "evaluation_design.baselines_contemporary",
     12       "sonnet": {"applies": false, "answer": false},
     13       "opus": {"applies": true, "answer": false},
     14       "direction": "applies_boundary",
     15       "explanation": "Sonnet set applies=false reasoning that since no baseline comparisons were made, contemporariness is not applicable. Opus considers that since baselines_included has applies=true (prior surveys exist and should be compared), baselines_contemporary also applies — the question is whether any baselines used are contemporary, and the paper could have included contemporary survey comparisons but did not. The paper simply fails to compare against any prior surveys at all, making applies=true, answer=false the correct assessment."
     16     },
     17     {
     18       "question": "conflicts_of_interest.funder_independent_of_outcome",
     19       "sonnet": {"applies": false, "answer": false},
     20       "opus": {"applies": true, "answer": false},
     21       "direction": "applies_boundary",
     22       "explanation": "Sonnet set applies=false because no funding source is disclosed, interpreting this as 'unfunded.' Opus considers that the author is affiliated with Juniper Networks Inc., which could be the implicit funder (work done on company time). The schema says NA only 'if unfunded,' but absence of a funding disclosure is not evidence of being unfunded — it is a failure to disclose. Since the author has a corporate affiliation, the criterion applies, and the lack of any independence statement means answer=false."
     23     }
     24   ],
     25   "opus_checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No code repository URL, GitHub link, or Zenodo archive is provided anywhere in the paper. The survey could have released analysis scripts, search query logs, or paper classification data, but nothing is made available."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The corpus of 223+ reviewed papers is not released as a structured dataset. No supplementary data file, spreadsheet, or downloadable list of included papers is provided."
     36       },
     37       "environment_specified": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "This is a pure literature survey with no computational experiments, so there is no software environment to specify."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step reproduction instructions are provided. A reader cannot reproduce the literature search or paper selection process because search databases, queries, and filtering pipeline are not described."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "This is a descriptive literature survey reporting raw counts and percentages of papers (Figures 1-6). No statistical aggregation requiring confidence intervals is performed."
     53       },
     54       "significance_tests": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No comparative statistical claims are made. The paper reports descriptive counts and proportions from the surveyed corpus without testing differences."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "This is a descriptive survey with no meta-analytic aggregation of effect sizes from reviewed papers."
     63       },
     64       "sample_size_justified": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "No experiments or statistical analyses requiring sample size justification are conducted. This is a literature survey."
     68       },
     69       "variance_reported": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "No experimental runs are conducted. The survey reports descriptive statistics from the literature corpus."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "The survey does not compare itself against any prior surveys in the LLM-SQA space. No prior reviews are referenced as baselines to demonstrate the additive value of this survey."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "Since baselines_included applies (prior surveys in LLM-SQA exist and could be compared against), the contemporariness of those baselines also applies. The paper includes no baseline comparisons at all, so it uses no contemporary baselines; the answer is therefore false."
     85       },
     86       "ablation_study": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "This is a survey paper with no system components to ablate."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The survey characterizes the literature across multiple dimensions: publication trends (Figure 1), dataset utilization (Figure 2), evaluation approaches (Figure 3), fine-tuning adoption (Figure 4), LLM usage distribution (Figure 5), and prompting strategies (Figure 6)."
     95       },
     96       "human_evaluation": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "Human evaluation of the survey's own outputs is not clearly relevant for this type of literature review."
    100       },
    101       "held_out_test_set": {
    102         "applies": false,
    103         "answer": false,
    104         "justification": "This is a survey paper; no train/test split is applicable."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Table I provides a detailed per-standard, per-application breakdown mapping LLM applications to ISO/IEC 12207, 25010, 5055, ISO 9001, CMMI, and TMM. Figures 1-6 also break down by multiple categories."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section VI (Challenges, Limitations, and Risks) discusses failure modes of LLM-based SQA including data privacy exposure, model bias, black-box explainability, and resource constraints."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Section V.B reports that 'nearly 30% of the reviewed papers did not specify any dataset,' Section V.E reports 'nearly 19% of papers did not specify which LLM was used,' and Section VI enumerates challenges and failure patterns across the surveyed literature."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The abstract promises a survey of LLM-SQA intersection with standards mapping, case studies, challenges, and future directions. The paper delivers on these with Section V (literature characterization), Table I (standards mapping), Section VI (challenges), and Section VII (future directions)."
    127       },
    128       "causal_claims_justified": {
    129         "applies": false,
    130         "answer": false,
    131         "justification": "The paper is a survey that summarizes existing work. It uses hedged possibility language ('can enhance,' 'can augment') when describing LLM capabilities. It makes no original causal claims requiring causal inference study design."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper's abstract claims to 'provide a comprehensive blueprint for integrating LLMs into SQA in a trustworthy, efficient, and standards-aligned manner' without adequately bounding generalizations to the specific 2023-2025 papers reviewed. The conclusions generalize broadly despite the surveyed literature itself having significant gaps (30% no dataset, 19% no LLM specified)."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "This is a descriptive survey that does not present original empirical results requiring alternative explanations. The schema specifies NA for 'pure surveys or taxonomies.'"
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "This is a survey paper that does not run experiments with LLMs."
    149       },
    150       "prompts_provided": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No LLM prompting is used in this survey paper."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No LLM experiments are conducted in this survey."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used; this is a literature survey."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "Section IV describes 7 high-level inclusion criteria but does not document the actual search process: it names no databases, gives no search strings, reports no initial pool size, and lists no filtering stages with counts. 'Over 223 papers' are mentioned with no explanation of how the starting pool was identified or reduced. This fails the schema requirement for filtering CRITERIA at each stage."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "Section VI is titled 'Challenges, Limitations, and Risks' but discusses limitations of LLM technology in SQA generally, not limitations of THIS survey's own methodology. There is no dedicated discussion of the survey's methodological shortcomings (e.g., single reviewer, informal search, no quality assessment of included papers)."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No threats to validity specific to this survey are discussed. Missing: selection bias from informal search, lack of systematic review protocol, single-author classification without inter-rater reliability, possible incomplete literature coverage."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The survey does not explicitly state what it does NOT cover. No mention of excluded languages, excluded databases, excluded gray literature, or which LLM application areas are out of scope."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The full list of 223+ reviewed papers is not provided as a structured, verifiable dataset. Readers cannot independently verify which papers were classified into which categories."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "Section IV lists inclusion criteria but omits the data collection procedure: no search databases named (e.g., IEEE Xplore, ACM DL, Scopus), no search queries provided, no time window for the search itself stated."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants involved. This is a literature survey using published papers as data sources, not a study with human subjects."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No PRISMA-style flowchart or equivalent. The pipeline from initial paper discovery to the final 223+ papers is undocumented. No counts of papers retrieved, screened, or excluded at each stage."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No acknowledgments section, funding disclosure, or grant information is present in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "The author's affiliation with Juniper Networks Inc. is clearly disclosed on the title page alongside an ORCID identifier."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "The author is affiliated with Juniper Networks Inc. but no funding source is disclosed. Absence of a funding disclosure is not evidence of being unfunded — the work could have been done on company time. Since we cannot verify funder independence, answer is false."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or declaration of financial interests appears anywhere in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "This is a literature survey that does not evaluate any pre-trained model on a benchmark."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No benchmark evaluation of pre-trained models is conducted."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No benchmark evaluation of pre-trained models is conducted."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants involved; this is a literature survey."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants involved; this is a literature survey."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants involved; this is a literature survey."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants involved; this is a literature survey."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants involved; this is a literature survey."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants involved; this is a literature survey."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants involved; this is a literature survey."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "This is a survey paper with no original LLM experiments."
    291       },
    292       "compute_budget_stated": {
    293         "applies": false,
    294         "answer": false,
    295         "justification": "This is a survey paper with no original computational experiments."
    296       }
    297     }
    298   }
    299 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs