calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (18228B)
      1 {
      2   "paper_slug": "across-programming-language-2025",
      3   "total_questions": 50,
      4   "agreement_count": 48,
      5   "disagreement_count": 2,
      6   "agreement_rate": 0.96,
      7   "disagreements": [
      8     {
      9       "category": "statistical_methodology",
     10       "question": "effect_sizes_reported",
     11       "sonnet_applies": true,
     12       "sonnet_answer": false,
     13       "opus_applies": true,
     14       "opus_answer": true,
     15       "opus_justification": "The schema states that 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper's tables consistently show both baseline and enhanced absolute Pass@K values alongside percentage improvements (e.g., Table II: baseline 55.28 → 95.85, '73% up'). This provides the magnitude of effect with full baseline context. While no formal effect size metric like Cohen's d is used, the schema explicitly allows 'percentage improvement with baseline context' as sufficient.",
     16       "sonnet_justification": "The paper reports percentage improvements (e.g., '73% improvement', '+15.75% for multi-lingual LLMs') but without any standardized effect sizes (Cohen's d, odds ratios) or baseline context sufficient for formal effect size assessment. While relative improvements are shown, no formal effect size metric is used.",
     17       "direction": "opus_generous"
     18     },
     19     {
     20       "category": "conflicts_of_interest",
     21       "question": "funder_independent_of_outcome",
     22       "sonnet_applies": false,
     23       "sonnet_answer": false,
     24       "opus_applies": true,
     25       "opus_answer": false,
     26       "opus_justification": "The schema says 'NA if unfunded.' Sonnet marked applies=false reasoning that no funding is disclosed making it unassessable. However, the authors are at major academic institutions (Chinese Academy of Sciences, HKUST) where research is typically funded. The absence of a funding disclosure does not mean the work is unfunded — it means we cannot verify funder independence. Since it is not clearly unfunded solo independent work, applies=true, answer=false is more appropriate.",
     27       "sonnet_justification": "No funding is disclosed, making this criterion not assessable. Authors are from academic institutions with no apparent financial stake in any evaluated model.",
     28       "direction": "applies_boundary"
     29     }
     30   ],
     31   "opus_checklist": {
     32     "artifacts": {
     33       "code_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Section X (Data Availability) provides a URL: https://anonymous.4open.science/r/Cross-Lingual-RACG-0F3C. The paper states 'We released the artifact and all experiment data' at this link."
     37       },
     38       "data_released": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The 13,910-instance Multilingual Code Dataset Expansion is released at the artifact URL (Section X). HumanEval-X is a publicly available dataset."
     42       },
     43       "environment_specified": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper mentions specific model names and greedy decoding settings (Section II.E), but provides no requirements.txt, Dockerfile, or library version list. Insufficient detail to recreate the environment."
     47       },
     48       "reproduction_instructions": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper references the artifact URL but provides no step-by-step reproduction instructions within the paper text. No README commands or scripted replication pipeline described."
     52       }
     53     },
     54     "statistical_methodology": {
     55       "confidence_intervals_or_error_bars": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "Only point estimates (Pass@K values) reported across all tables. The Std columns represent cross-language spread, not uncertainty over experimental runs. No confidence intervals or error bars provided."
     59       },
     60       "significance_tests": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Comparative claims (e.g., 'Java demonstrates superior cross-lingual utility over Python') are made throughout by comparing point estimates without any statistical significance tests (no p-values, t-tests, or bootstrap tests)."
     64       },
     65       "effect_sizes_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Tables consistently show absolute baseline and enhanced Pass@K values alongside percentage improvements (e.g., Table II: baseline 55.28 mean → 95.85 mean, '73% up'). Per the schema, 'percentage improvement with baseline context' is sufficient for YES. Both raw differences and relative improvements are provided with full context."
     69       },
     70       "sample_size_justified": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The dataset contains approximately 1K instances per language (Section II.C) but no power analysis or explicit justification for this sample size. The size was determined by data availability, not principled calculation."
     74       },
     75       "variance_reported": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "Experiments use greedy decoding (temperature=0.0, Section II.E) producing single deterministic outputs. No variance across runs is reported. The Std columns in tables represent cross-language variation, not across-run variance."
     79       }
     80     },
     81     "evaluation_design": {
     82       "baselines_included": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "All experiments compare against 'Baseline (without RAG/injection)' throughout Tables II-IX. Retrieval strategies are compared against each other in Table X (BM25, BGE-large, CodeRankEmbed)."
     86       },
     87       "baselines_contemporary": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Models from 2023-2024 (CodeLlama, DeepSeek-Coder, Qwen2.5-Coder). CodeRankEmbed is from 2024. These are contemporary at time of writing."
     91       },
     92       "ablation_study": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Doc vs Doc w/o NL settings (removing NL comments), knowledge injection vs full RACG pipeline, and mono-lingual vs multi-lingual LLMs comparisons systematically isolate individual components."
     96       },
     97       "multiple_metrics": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Pass@K for generation evaluation (Section II.D), Precision@K and Recall@K for retrieval evaluation (Table X). Multiple metrics used across different experimental aspects."
    101       },
    102       "human_evaluation": {
    103         "applies": false,
    104         "answer": false,
    105         "justification": "This is a benchmark evaluation using automated test case execution (Pass@K). Human evaluation is clearly irrelevant when executable unit tests provide definitive correctness judgments."
    106       },
    107       "held_out_test_set": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Evaluation uses test case execution on generated code, which is independent of the retrieval corpus. Test cases are held out from the retrieval and generation process. The evaluation mechanism (pass/fail on unit tests) is separate from retrieved documents."
    111       },
    112       "per_category_breakdown": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "All result tables provide per-programming-language breakdowns across all 13 PLs (Tables III, V, VI, IX). Language-specific performance variations are explicitly analyzed."
    116       },
    117       "failure_cases_discussed": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper discusses failure modes: PHP and Scala corpus degrading performance (Table V), Python mono-lingual LLMs suffering native language degradation, Perl and Go corpus causing widespread degradation for mono-lingual LLMs. Section VIII also acknowledges limitations."
    121       },
    122       "negative_results_reported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Multiple negative results reported: cross-lingual RACG degrading mono-lingual LLMs on native Python (-4.57% in Table V), some corpora (PHP, Scala, Go) causing net-negative effects, cross-lingual knowledge sometimes interfering with existing strengths."
    126       }
    127     },
    128     "claims_and_evidence": {
    129       "abstract_claims_supported": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The four key abstract insights (effectiveness, inequality, robustness, specialization) are each supported by corresponding experimental results in Tables II-X. Abstract claims do not exceed what the results show."
    133       },
    134       "causal_claims_justified": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The controlled knowledge injection setting (Section II.B.1) isolates the generation variable using an oracle retriever. Adversarial attacks (RQ3) directly manipulate corpus documents while holding other variables constant. These controlled manipulations are adequate for the causal inferences drawn."
    138       },
    139       "generalization_bounded": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper tests only 7B-parameter models but the conclusion claims to 'establish foundational insights for designing more powerful and safer code intelligence.' Section VIII acknowledges model size as a threat but abstract and conclusion still make broad, unbounded claims about multi-lingual RACG systems generally."
    143       },
    144       "alternative_explanations_discussed": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper asserts singular interpretations for findings without systematically considering alternatives. For example, the claim that 'multi-lingual LLMs rely on internal knowledge to resist adversarial perturbations' is not tested against alternatives (e.g., distributional shift, tokenization differences). Section VIII discusses experimental limitations, not alternative explanations for observed results."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Section II.E specifies exact model identifiers: CodeLlama-7B-Instruct, Deepseek-Coder-6.7B-Instruct, Qwen2.5-Coder-7B-Instruct, Phi-1, Phi-1.5, and CodeRankEmbed. These are specific HuggingFace model names mapping to unique checkpoints."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Section II.E states 'we use a unified prompt template for LLMs, following the design in [19] and [43]' but the actual prompt text is not provided in the paper or appendix. Readers must consult referenced papers to reconstruct prompts."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section II.E reports temperature=0.0 (greedy decoding), top-K=3 retrieval window, and fixed random seed of 42 for perturbation experiments. Key hyperparameters are stated."
    165       },
    166       "scaffolding_described": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The RACG pipeline is described in Section II: retriever (CodeRankEmbed) retrieves top-K documents, injected into prompt for LLM generation. Figure 1 illustrates the full pipeline with four experimental settings. The scaffold is sufficiently described."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section II.C documents dataset construction: starting from HumanEval-X and extensions of [43][44], LLM-generated reference solutions via Qwen2-72B-Instruct-GPTQ-Int4, 5 iterations of generation + unit test verification, per-language counts documented."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section VIII 'Threats to Validity' is a dedicated section discussing methodological limitations."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section VIII identifies two specific threats: (1) models not exceeding 7B parameters may not represent broader LLM landscape, and (2) fixed random seed (42) limits perturbation diversity. Both are specific to this study with mitigation measures described."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper does not explicitly state what the results do NOT show. Section VIII hedges about model representativeness but does not state specific untested claims. The conclusion broadly claims to 'establish foundational insights for designing more powerful and safer code intelligence' without bounding these claims to the tested setting."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The artifact URL (Section X) includes 'all experiment data' including the 13,910-instance dataset. Raw data is available for independent verification."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section II.C describes dataset construction: source datasets (HumanEval-X, [43], [44]), LLM solution generation, unit test verification across 5 iterations, per-language instance counts."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. Data sourced from existing benchmarks (HumanEval-X, McEval, MultiPL-E) and LLM-generated solutions verified by unit tests."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Section II.C documents the full pipeline: source datasets → LLM generation of missing solutions → unit test verification over 5 iterations → 13,910 final instances with per-language counts (Python: 1181, Kotlin: 1071, etc.)."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No acknowledgments section or funding disclosure appears in the paper. No mention of grants, corporate sponsors, or funding agencies."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations clearly listed: Chinese Information Processing Laboratory (Institute of Software, CAS), University of Chinese Academy of Sciences, and Hong Kong University of Science and Technology."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No funding is disclosed. Authors are at major academic institutions (CAS, HKUST) where research is typically funded. Since funding is not disclosed, funder independence cannot be verified. This is not clearly unfunded solo independent work, so the criterion applies but is not satisfied."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests statement appears in the paper. Absence of disclosure is not absence of conflict."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "The paper evaluates models (CodeLlama, DeepSeek-Coder, Qwen2.5-Coder, Phi-1, Phi-1.5) on HumanEval-X without stating any model's training data cutoff date."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "HumanEval-X is derived from HumanEval (published 2021). All evaluated models were trained on data post-2021 and may have seen HumanEval problems. No discussion of potential train/test overlap."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "HumanEval was published in 2021; all tested models (CodeLlama-2023, DeepSeek-Coder-2024, Qwen2.5-Coder-2024, Phi-2023) have training cutoffs after its release. The paper does not address whether HumanEval problems appeared in training data."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants. This is a benchmark evaluation of LLM-based code generation systems."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants. Not an experimental study with human subjects."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants. Blinding not applicable to automated benchmark evaluation."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No inference costs, API costs, tokens consumed, or wall-clock time reported. Given the scale of experiments (13,910 instances across multiple models and settings), this is a notable omission."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No GPU hours, hardware specifications, or total compute budget stated despite using Qwen2-72B for dataset generation and multiple 7B models for evaluation."
    302       }
    303     }
    304   }
    305 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs