ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26726B)


      1 {
      2   "paper": {
      3     "title": "The AI Productivity Index: APEX-v1-extended",
      4     "authors": [
      5       "Bertie Vidgen",
      6       "Abby Fennelly",
      7       "Evan Pinnix",
      8       "Julien Benchek",
      9       "Daniyal Khan",
     10       "Zach Richards",
     11       "Austin Bridges",
     12       "Calix Huang",
     13       "Kanishka Sahu",
     14       "Abhishek Kottamasu",
     15       "Bo Ma",
     16       "Ben Hunsberger",
     17       "Isaac Robinson",
     18       "Akul Datta",
     19       "Chirag Mahapatra",
     20       "Dominic Barton",
     21       "Cass R. Sunstein",
     22       "Eric Topol",
     23       "Brendan Foody",
     24       "Osvald Nitski"
     25     ],
     26     "year": 2025,
     27     "venue": "arXiv",
     28     "arxiv_id": "2509.25721",
     29     "doi": "10.48550/arXiv.2509.25721"
     30   },
     31   "checklist": {
     32     "artifacts": {
     33       "code_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper states 'Our eval harness is available on Github' (Section 1) with a reference to 'Mercor on Github'. A GitHub repository is cited."
     37       },
     38       "data_released": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper releases 'n = 25 non-benchmark example cases per job (n = 100 total)' as 'apex-v1-devset' on Hugging Face with a CC-BY license (Section 1). The main n=400 heldout set is intentionally kept closed for benchmark integrity."
     42       },
     43       "environment_specified": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency versions are mentioned. The paper does not describe how to set up the evaluation harness environment."
     47       },
     48       "reproduction_instructions": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "While a GitHub repository and devset are referenced, no step-by-step reproduction instructions are provided in the paper. There is no 'Reproducing Results' section or commands to run."
     52       }
     53     },
     54     "statistical_methodology": {
     55       "confidence_intervals_or_error_bars": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "95% confidence intervals are reported for model scores. Table 3 shows z-scores with 95% CIs (e.g., 'GPT 5 (High) 0.50, 0.42–0.58'), and Figure 1 displays confidence intervals visually."
     59       },
     60       "significance_tests": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Friedman omnibus test is used (p < 0.000001) for overall model differences, and paired t-tests with Bonferroni correction are used for pairwise comparisons (Section 4.2): '35 of the 45 pairwise comparisons (78%) remain statistically significant at the adjusted threshold (p = 0.001).'"
     64       },
     65       "effect_sizes_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Z-scores are reported as a measure of effect size (Table 3), showing how many standard deviations each model's performance is above/below the mean. Raw percentage differences are also provided with baseline context (e.g., GPT 5 at 67.0% vs. second-best at 64.3%)."
     69       },
     70       "sample_size_justified": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The benchmark was increased from n=200 to n=400 and runs increased from 3 to 8, but no power analysis or formal justification for these specific numbers is provided. No discussion of whether n=100 per job is sufficient for the granularity of claims made."
     74       },
     75       "variance_reported": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Standard deviation across the 8 runs is reported: 'the mean standard deviation over the eight runs is 9.05 percentage points, ranging from 8.25 (Claude Opus 4.5) to 9.5 (Grok 4)' (Section 4.1)."
     79       }
     80     },
     81     "evaluation_design": {
     82       "baselines_included": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "10 frontier models are compared against each other across four job domains (Table 2). This is primarily a leaderboard paper, and the inter-model comparison serves as the baseline structure."
     86       },
     87       "baselines_contemporary": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "All 10 models tested were current frontier models when responses were collected at the end of November 2025: GPT 5, GPT 5.1, Gemini 3 Pro, Gemini 2.5 Pro/Flash, Claude Opus 4.5/4.1, Sonnet 4.5, o3, and Grok 4. These are contemporary and competitive."
     91       },
     92       "ablation_study": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No ablation study is performed. The paper does not investigate which components of the benchmark design (rubric structure, source document inclusion, prompt format) contribute to the observed performance differences."
     96       },
     97       "multiple_metrics": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper reports both raw mean percentage scores and z-scores (Table 2, Table 3), which capture different aspects of performance — raw scores for absolute capability and z-scores for relative standing adjusting for task difficulty."
    101       },
    102       "human_evaluation": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "All grading is done by a single LM judge (Gemini 2.5 Flash). No human evaluation of model outputs is reported. The paper acknowledges switching from a panel of LM judges to a single LM judge but does not validate against human judgments for the extended benchmark."
    106       },
    107       "held_out_test_set": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper explicitly maintains a held-out evaluation set of n=400 cases: 'The n = 400 cases will remain a closed heldout dataset for rigorous evaluation of frontier models' (Section 1). The devset (n=100) is separate and open-sourced."
    111       },
    112       "per_category_breakdown": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Table 2 provides per-job breakdowns (Investment banking, Law, Management consulting, Medicine) for all 10 models, showing meaningful performance variation across domains."
    116       },
    117       "failure_cases_discussed": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "No qualitative failure analysis is provided. The paper reports aggregate scores but does not examine specific cases where models failed, what types of tasks are hardest, or why certain rubric criteria are more challenging."
    121       },
    122       "negative_results_reported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper notes that 'APEX-v1-extended shows that frontier models still have substantial limitations when performing typical professional tasks' (Abstract). It also highlights that some models (GPT 5.1, Gemini 2.5 Flash, Sonnet 4.5) score 10+ percentage points lower on the devset, and that Investment banking remains particularly difficult."
    126       }
    127     },
    128     "claims_and_evidence": {
    129       "abstract_claims_supported": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The abstract claims GPT 5 (Thinking = High) is the top model at 67.0%, which matches Table 2. The claim that 'frontier models still have substantial limitations' is supported by the observation that no model exceeds 67% overall."
    133       },
    134       "causal_claims_justified": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper makes implicit causal claims about thinking mode: models are tested with 'Thinking = High' as if this setting causes better performance, and the z-score analysis implies Opus 4.5 is 'particularly strong on the hardest tasks' (Section 4.2). No controlled analysis isolates the effect of thinking mode vs. model architecture."
    138       },
    139       "generalization_bounded": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The title 'The AI Productivity Index' and framing suggest this measures general AI productivity for professional work, but the benchmark covers only 4 specific US-centric professional roles with n=100 cases each. The paper does not bound its claims to these specific jobs and settings."
    143       },
    144       "alternative_explanations_discussed": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No discussion of alternative explanations for the results. For example: could differences be driven by the LM judge's biases rather than genuine model capability differences? Could source document parsing affect some models more than others? The paper does not address these possibilities."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Models are identified by marketing names only: 'GPT 5 (Thinking = High)', 'Gemini 3 Pro (Thinking = High)', 'Opus 4.5 (On)', etc. No API version identifiers, snapshot dates, or specific model IDs are provided. The paper states 'Responses were collected at the end of November 2025' but gives no model version strings."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "The LM judge prompt is provided in full in Appendix A. The instruction appended to every prompt is provided in full in Appendix B. An example prompt and rubric are shown in Figure 3. The actual task prompts are in the released devset."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "Only temperature is mentioned: 'If temperature can be configured, we set it to 0.7' (Section 3.1). No other hyperparameters (top-p, max tokens, etc.) are reported. The conditional 'if temperature can be configured' leaves ambiguity about which models used what settings."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No agentic scaffolding is used. Models receive single-turn prompts with attached source documents and produce single-turn responses."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Source document processing is described: 'Sources were parsed using Reducto and appended to the prompt in the context window' (Section 2.3). The production process is shown in Figure 2. The quality control pipeline (expert creation, review, approval/rejection) is described in Section 2.2."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "There is no Limitations, Threats to Validity, or Discussion section in the paper. The paper goes from Results (Section 4) directly to Acknowledgments (Section 5) and References."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No threats to validity are discussed anywhere in the paper. Issues such as LM judge reliability, rubric quality variation, source document parsing fidelity, or benchmark representativeness are not addressed."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show, what professional tasks are excluded, or what limitations exist in generalizing from these 4 jobs to broader AI productivity claims."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The main benchmark data (n=400 heldout set) is not released. Only the devset (n=100) is open-sourced. Raw model responses, judge outputs, and per-criterion scores are not publicly available for independent verification."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 2 describes the data collection procedure in detail: expert selection from the Mercor platform, interview process, assessment, prompt creation by experts based on their day-to-day work, rubric creation, and multiple rounds of review (Sections 2.1-2.4)."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Expert recruitment is described in Section 2.1: 'From the Mercor platform we sourced experts with extensive professional experience, and prioritized experts with data labeling experience.' The interview process (30-45 min interview, 1-2 hour assessment) and ongoing quality management are described."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The production process is shown in Figure 2 and described across Sections 2.1-2.4, covering expert selection, prompt creation, review, rubric creation, and source document processing. The pipeline from APEX-v1.0 to APEX-v1-extended is documented (recalibration of original 50 + 50 new cases per job)."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding information is disclosed. The Acknowledgments section (Section 5) thanks annotators and advisors but does not mention funding sources. Mercor is a for-profit company, and the funding for this work is not stated."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations are clearly listed: most authors are affiliated with Mercor (superscript 1), with Cass R. Sunstein at Harvard Law School (superscript 2) and Eric Topol at The Scripps Research Institute (superscript 3)."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Mercor is an AI hiring/talent platform that benefits commercially from AI productivity measurement. The benchmark is branded with Mercor's name and hosted on their website (mercor.com/apex). The funder (Mercor) has a direct commercial interest in demonstrating that AI productivity can be measured and benchmarked, making them non-independent of the outcome."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests or financial interests statement is present in the paper. Given that Mercor is a commercial company that could benefit from the benchmark's adoption and visibility, the absence of any disclosure is a gap."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No training data cutoff dates are stated for any of the 10 models tested. The paper only mentions that responses were collected 'at the end of November 2025' but does not discuss when models' training data was collected."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No discussion of potential train/test overlap. Given that APEX-v1.0 was published in September 2025 and the devset is open-sourced, models trained after that date could have seen the benchmark design or similar tasks. This is not addressed."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "The paper addresses contamination risk through design: 'The n = 400 cases will remain a closed heldout dataset for rigorous evaluation of frontier models' (Section 1). Keeping the evaluation set private is a direct contamination mitigation, though the paper does not explicitly discuss contamination as a concept."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "This is a benchmark evaluation paper, not a human subjects study. The experts who created the benchmark are contributors/annotators, not research participants."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants are studied. Expert annotators are paid contractors creating benchmark content, not research subjects."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants are studied. Some expert characterization is provided (137 experts, 7+ years mean experience) but this describes benchmark creators, not research participants."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants are studied as research subjects. Expert selection criteria are described but in the context of benchmark construction quality, not as a human subjects study."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Not a human subjects experimental study. No participant randomization to conditions."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Not a human subjects experimental study. No blinding procedures are applicable."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Not a human subjects study. While the paper mentions offboarding underperformers among experts, this is workforce management, not participant attrition in a study."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No inference costs are reported. Each model is run 8 times on 400 cases (3,200 runs per model, 32,000 total), but the cost of these API calls is not mentioned."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No computational budget is stated. Total API spend, compute hours, and evaluation time are all unreported, despite running 10 models x 400 cases x 8 runs = 32,000 model inferences plus LM judge evaluations."
    302       }
    303     }
    304   },
    305   "claims": [
    306     {
    307       "claim": "GPT 5 (Thinking = High) is the top performing model on APEX-v1-extended with a mean score of 67.0%.",
    308       "evidence": "Table 2 shows GPT 5 (Thinking = High) at 67.0% overall, followed by Gemini 3 Pro (High) at 64.3% and Grok 4 at 63.5%. Z-scores in Table 3 confirm GPT 5 leads at 0.50 vs. second-place Opus 4.5 at 0.28.",
    309       "supported": "strong"
    310     },
    311     {
    312       "claim": "Differences between models' scores are statistically significant overall.",
    313       "evidence": "Friedman omnibus test (p < 0.000001) and paired t-tests with Bonferroni correction showing 35/45 pairwise comparisons significant at p = 0.001 (Section 4.2).",
    314       "supported": "strong"
    315     },
    316     {
    317       "claim": "Frontier models still have substantial limitations when performing typical professional tasks.",
    318       "evidence": "The best model achieves only 67.0% overall, with scores as low as 51.4% for the weakest model. Performance varies across jobs with Investment banking being hardest (top score 63.0%). However, 'substantial limitations' is somewhat vague without a defined threshold.",
    319       "supported": "moderate"
    320     },
    321     {
    322       "claim": "Model ranks are consistent between the heldout benchmark and the open-source devset.",
    323       "evidence": "Table 4 shows ranks are largely preserved between the two sets, with GPT 5 ranked #1 on both and only minor swaps among adjacent models. However, absolute score differences range from 1.2 to 11.9 percentage points.",
    324       "supported": "moderate"
    325     },
    326     {
    327       "claim": "Claude Opus 4.5 (Thinking = On) is particularly strong on the hardest tasks.",
    328       "evidence": "Section 4.2 states Opus 4.5 jumps from 5th to 2nd in z-score ranking, suggesting strong performance relative to task difficulty. However, no formal analysis of performance conditional on task difficulty is presented beyond the z-score reranking.",
    329       "supported": "weak"
    330     }
    331   ],
    332   "methodology_tags": [
    333     "benchmark-eval"
    334   ],
    335   "key_findings": "APEX-v1-extended benchmarks 10 frontier AI models on 400 realistic professional tasks across investment banking, management consulting, law, and medicine. GPT 5 (Thinking = High) leads the leaderboard at 67.0%, with statistically significant differences between most model pairs after Bonferroni correction. Models perform worst on investment banking tasks and best on law tasks. The benchmark reveals that even top frontier models fail to complete roughly one-third of professional task criteria, suggesting meaningful gaps between AI capability and professional work requirements.",
    336   "red_flags": [
    337     {
    338       "flag": "Company evaluating its own product category",
    339       "detail": "Mercor is an AI-powered talent/hiring platform that commercially benefits from the existence and adoption of AI productivity benchmarks. The benchmark is branded and hosted on Mercor's website (mercor.com/apex). This creates a structural conflict of interest where positive benchmark attention benefits the company regardless of which model wins."
    340     },
    341     {
    342       "flag": "No limitations section",
    343       "detail": "The paper contains no limitations, threats to validity, or discussion section whatsoever. It goes directly from Results to Acknowledgments. This is a significant methodological gap for a paper making claims about AI productivity across professional domains."
    344     },
    345     {
    346       "flag": "Single LM judge without human validation",
    347       "detail": "All grading is performed by a single model (Gemini 2.5 Flash). The paper acknowledges switching from a panel of judges to a single judge for efficiency but provides no human-agreement validation for the extended benchmark. LM judge biases could systematically affect the leaderboard rankings."
    348     },
    349     {
    350       "flag": "Overbroad framing vs narrow evidence",
    351       "detail": "The paper is titled 'The AI Productivity Index' suggesting comprehensive measurement of AI economic productivity, but it tests only 4 US-centric professional roles with 100 constructed cases each. The gap between the framing (general AI productivity) and the evidence (specific professional task completion in 4 jobs) is significant."
    352     },
    353     {
    354       "flag": "No model version identifiers",
    355       "detail": "Models are identified only by marketing names without API versions or snapshot dates. Model behavior changes across versions, making these results difficult to reproduce or verify independently."
    356     },
    357     {
    358       "flag": "Closed heldout set prevents independent verification",
    359       "detail": "The primary evaluation data (n=400) is kept private. While this protects against contamination, it also means the leaderboard results cannot be independently verified. Only Mercor can run the benchmark, creating a gatekeeping dynamic."
    360     }
    361   ],
    362   "cited_papers": [
    363     {
    364       "title": "Dynabench: Rethinking Benchmarking in NLP",
    365       "authors": ["Douwe Kiela", "Max Bartolo", "Yixin Nie"],
    366       "year": 2021,
    367       "relevance": "Foundational work on dynamic benchmarking methodology in NLP, relevant to understanding benchmark design principles."
    368     },
    369     {
    370       "title": "GDPVal: Evaluating AI's Economic Value",
    371       "authors": ["OpenAI"],
    372       "year": 2025,
    373       "relevance": "OpenAI's benchmark for evaluating AI's economic value, directly comparable to APEX in measuring real-world AI productivity."
    374     },
    375     {
    376       "title": "Introducing BigLaw Bench: A Public-Facing Dataset for Evaluating LLMs on Legal Tasks",
    377       "authors": ["Harvey Team"],
    378       "year": 2024,
    379       "relevance": "Domain-specific legal benchmark from Harvey AI, relevant as a comparison benchmark for professional AI evaluation."
    380     },
    381     {
    382       "title": "The AI Productivity Index (APEX)",
    383       "authors": ["Bertie Vidgen", "Abby Fennelly", "Evan Pinnix"],
    384       "year": 2025,
    385       "arxiv_id": "2509.25721",
    386       "relevance": "Original APEX-v1.0 paper; this extended version builds on it with expanded data and improved grading."
    387     },
    388     {
    389       "title": "Estimating AI Productivity Gains from Claude Conversations",
    390       "authors": ["Alex Tamkin", "Peter McCrory"],
    391       "year": 2025,
    392       "relevance": "Anthropic's study on AI productivity gains, directly relevant to AI productivity measurement methodology."
    393     },
    394     {
    395       "title": "HealthBench: Evaluating Large Language Models Towards Improved Human Health",
    396       "authors": ["Rahul K. Arora", "Jason Wei"],
    397       "year": 2025,
    398       "relevance": "Domain-specific health/medical LLM benchmark, relevant as a comparable professional-domain evaluation."
    399     },
    400     {
    401       "title": "Reality Check: A New Evaluation Ecosystem is Necessary to Understand AI's Real World Effects",
    402       "authors": ["Reva Schwartz", "Rumman Chowdhury"],
    403       "year": 2025,
    404       "relevance": "Argues for new evaluation approaches to measure real-world AI effects, directly relevant to the AI benchmarking methodology discussion."
    405     },
    406     {
    407       "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research",
    408       "authors": ["Giulio Starace", "Oliver Jaffe"],
    409       "year": 2025,
    410       "relevance": "Benchmark evaluating AI's ability to replicate research, relevant to AI capability evaluation methodology."
    411     },
    412     {
    413       "title": "A Survey on LLM-as-a-Judge",
    414       "authors": ["Jiawei Gu", "Xuhui Jiang"],
    415       "year": 2025,
    416       "relevance": "Survey on LLM-as-judge methodology, directly relevant since APEX uses an LM judge for grading."
    417     },
    418     {
    419       "title": "Miracle or Myth? Assessing the Macroeconomic Productivity Gains from Artificial Intelligence",
    420       "authors": ["Francesco Filippucci", "Peter Gal", "Matthias Schief"],
    421       "year": 2024,
    422       "relevance": "OECD analysis of macroeconomic AI productivity gains, relevant to the broader AI productivity measurement context."
    423     },
    424     {
    425       "title": "Toward an Evaluation Science for Generative AI Systems",
    426       "authors": ["Laura Weidinger", "Inioluwa Deborah Raji"],
    427       "year": 2025,
    428       "relevance": "Proposes evaluation science for generative AI, relevant to benchmark design methodology and evaluation rigor."
    429     },
    430     {
    431       "title": "LMUnit: Fine-grained Evaluation with Natural Language Unit Tests",
    432       "authors": ["Jon Saad-Falcon", "Rajan Vivek"],
    433       "year": 2024,
    434       "relevance": "Proposes fine-grained evaluation using natural language unit tests, directly relevant to APEX's rubric-based criterion approach."
    435     }
    436   ]
    437 }

Impressum · Datenschutz