calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (15495B)
      1 {
      2   "paper_slug": "agent-developer-practices-2025",
      3   "calibration_date": "2026-02-28",
      4   "model": "opus",
      5   "total_questions": 50,
      6   "agreement_count": 50,
      7   "disagreement_count": 0,
      8   "agreement_rate": 1.0,
      9   "disagreements": [],
     10   "opus_checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No repository URL, Zenodo archive, or code release link is provided anywhere in the paper. The paper performs automated analysis using GPT-4o and GPT-4 but does not share scripts or code."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "The study collects 1,575 GitHub repositories and 20,620 developer discussions but provides no download link or public dataset. The raw discussion corpus and cleaned datasets are not released."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, conda environment, or dependency list is provided. The paper only mentions using GPT-4o and GPT-4 without specifying any library versions or environment details."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No README, reproduction script, or step-by-step guide is provided. The methodology is described in natural language in Section 3 but not in a form that would allow direct replication."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "All results are reported as point estimates (e.g., '25.6%', '14%', '25%', '23%'). No confidence intervals, error bars, or uncertainty estimates are provided for any finding."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper makes comparative claims across frameworks (e.g., AutoGen vs. LangChain on learning cost, development efficiency) but performs no statistical significance tests. All comparisons are frequency counts."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Only proportions and frequency counts are reported. No Cohen's d, odds ratios, relative risk, or formal effect size measures are provided for any comparison."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Sample sizes (1,575 repos, 8,710 discussions, 11,910 discussions) are described by filtering criteria but never justified relative to statistical power or adequacy for the claims being made."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No variance, standard deviation, or spread measure is reported for any result. All findings are single aggregate proportions with no measure of uncertainty."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No comparison against prior empirical studies of agent frameworks. The paper claims to be 'the first' such study but does not compare findings with related work (e.g., Cemri et al., Yehudai et al.) quantitatively."
     65       },
     66       "baselines_contemporary": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "No baselines are included, so the contemporariness of baselines cannot be assessed. The criterion is moot without baselines to evaluate."
     70       },
     71       "ablation_study": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "This is an observational mining/qualitative study, not a system with components to ablate. No experimental interventions are performed."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper evaluates frameworks across five dimensions (learning cost, development efficiency, functional abstraction, performance optimization, maintainability) and uses multiple popularity indicators (stars, forks, used-by, contributors, repo counts)."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Two domain experts independently reviewed all functional role annotations (Section 4.1.2, Cohen's kappa = 0.82) and a two-researcher open coding procedure was used in Section 6.1 (kappa = 0.81). These constitute human evaluation of the LLM-generated classifications."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "This is an observational study mining GitHub discussions. There is no predictive model and no train/test split. The concept of a held-out test set does not apply."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down per framework for all five evaluation dimensions (Sections 6.2.1-6.2.5) and by challenge category (Logic, Tool, Performance, Version) across SDLC stages (Section 5.2)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "RQ2 (Section 5) is entirely dedicated to analyzing failure cases: infinite loops (Figure 7), memory management failures (Figure 8), version incompatibilities (Figure 9), with detailed examples and percentages."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that performance optimization is 'a common shortcoming across all frameworks' (Finding 12) and that AutoGen and LangChain face 'the highest maintenance complexity' (Finding 13)."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Abstract claims (ten frameworks identified, taxonomy of four challenge categories, five-dimensional evaluation framework) are all substantiated in the results sections. Specific percentages cited in the abstract match the body."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper uses causal language such as 'LangChain and CrewAI lower the technical threshold' and 'AutoGen and LangChain excel at rapid prototyping' derived from observational discussion data with no controlled comparison or causal identification strategy."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The abstract and title present conclusions about 'agent developers' generally (e.g., 'more than 80% of developers report difficulties') without consistently scoping to the GitHub open-source context. Section 7.3 acknowledges this limitation but the main claims are unbounded."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 7.3 discusses specific alternative explanations: LLM-generated bias in classification, topic tags not reflecting actual usage, and GitHub sample bias excluding closed-source environments."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper refers to 'GPT-4o' and 'GPT-4' without specifying API version strings or snapshot dates. These are marketing names; model behavior varies across versions and snapshots."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Figure 5 provides the actual system and user prompts used for discussion summarization verbatim, including the full prompt text with the placeholder for batch_text."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Section 7.3 states 'we standardize the generation process by setting the temperature to 0.1 and using top-1 sampling.' Temperature and sampling strategy are explicitly reported."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "The paper uses LLMs as classification tools in a pipeline, not as autonomous agents with agentic scaffolding. No scaffolding applies."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 3 describes the full data collection and cleaning pipeline with explicit filtering criteria (compound keywords, non-empty README, stars>10, forks>5, recency) and counts at each stage (10,265 raw -> 8,710 cleaned)."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 7.3 'Threats to Validity' provides a dedicated limitations discussion covering internal validity (LLM bias, topic tag accuracy) and external validity (GitHub-only sample)."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The threats are specific to this study: LLM-generated bias from using GPT-4o for summarization, topic tags possibly not reflecting actual framework usage, exclusion of closed-source enterprise projects."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 7.3 explicitly states 'Our sample covers only GitHub open-source projects, excluding closed-source projects, which may limit the generalizability of our conclusions.'"
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The raw GitHub discussion corpus (8,710 and 11,910 discussions) is not made available for independent verification. No supplementary data files, dataset DOIs, or download links are provided."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 3 describes data collection in detail: GitHub API search with a compound keyword strategy, filtering criteria, time period (2022 to July 2025), and metadata collected (stars, forks, used-by, contributors, topics, discussions)."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "This is a repository mining study with no human participants. GitHub projects and discussions are the data source. The sampling criteria are described in Section 3 but this is not human recruitment."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Figure 1 shows the full pipeline from raw collection to analysis for both phases (RQ1/RQ2 and RQ3), with explicit counts at each step (10,265 raw -> duplicates removed -> invalid filtered -> 8,710 cleaned)."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The Acknowledgements section discloses funding from the State Key Laboratory of Blockchain and Data Security at Zhejiang University, NSFC grants (62332004, 62276279), and Guangdong Basic and Applied Basic Research Foundation (2024B1515020032)."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All author affiliations are disclosed: Sun Yat-sen University (4 authors), Zhejiang University (1), University of Melbourne (1), Technical University of Munich (1). No authors are affiliated with the evaluated frameworks."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The funders (NSFC, Guangdong research foundation, State Key Lab) are government and academic bodies with no financial stake in any of the ten frameworks evaluated."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "There is no competing interests statement, patent disclosure, or declaration of financial interests anywhere in the paper. Absence of a disclosure statement means the criterion is not satisfied."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "This is a mining/observational study. GPT-4o/GPT-4 are used as classifier/summarizer tools, not evaluated for their capability on a benchmark. Contamination criteria do not apply."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "No benchmark evaluation of pre-trained model capabilities is performed. The LLMs are tools for text classification, not the subject of evaluation."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "No capability benchmark is used. The study evaluates developer discussions about agent frameworks, not pre-trained model performance."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Repository mining study with no human participants. Mining GitHub discussions does not constitute a human subjects study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants. The study analyzes publicly available GitHub data."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants. The study examines GitHub repositories and discussion threads."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants. Repository selection criteria are described in Section 3 but these are data filtering criteria, not participant eligibility."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants and no experimental assignment. This is an observational study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants and not an experimental study. Blinding is not applicable."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants and no longitudinal tracking. Attrition is not applicable."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The paper uses GPT-4o and GPT-4 to summarize and classify 20,620 discussion threads but reports no API costs, token counts, or cost per example."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No GPU hours, API spend, wall-clock time, or hardware specifications are provided for the LLM-based analysis pipeline."
    281       }
    282     }
    283   }
    284 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs