ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

deep-eval.schema.json (2913B)


      1 {
      2   "$schema": "https://json-schema.org/draft/2020-12/schema",
      3   "$id": "deep-eval.schema.json",
      4   "title": "Deep Evaluation Result",
      5   "description": "Schema for optional deep evaluation of a paper. Produced by the deep-eval agent for papers selected for closer scrutiny.",
      6   "type": "object",
      7   "required": ["paper_id", "code_runs", "results_reproduce", "benchmark_contamination_check", "additional_findings"],
      8   "properties": {
      9     "paper_id": {
     10       "type": "string",
     11       "minLength": 1,
     12       "description": "Registry ID of the paper being evaluated."
     13     },
     14     "code_runs": {
     15       "type": "object",
     16       "description": "Whether the released code runs successfully.",
     17       "required": ["attempted", "success", "details"],
     18       "properties": {
     19         "attempted": {
     20           "type": "boolean",
     21           "description": "Whether code execution was attempted (false if no code released)."
     22         },
     23         "success": {
     24           "type": ["boolean", "null"],
     25           "description": "Whether the code ran successfully. Null if not attempted."
     26         },
     27         "details": {
     28           "type": "string",
     29           "description": "Description of what happened: environment setup, errors encountered, workarounds needed."
     30         }
     31       },
     32       "$comment": "Enforces the documented contract that success is null when execution was not attempted. The reverse direction is intentionally left unconstrained.",
     33       "if": {
     34         "required": ["attempted"],
     35         "properties": {
     36           "attempted": { "const": false }
     37         }
     38       },
     39       "then": {
     40         "properties": {
     41           "success": { "type": "null" }
     42         }
     43       }
     44     },
     45     "results_reproduce": {
     46       "type": "object",
     47       "description": "Whether key results from the paper can be reproduced.",
     48       "required": ["attempted", "success", "details"],
     49       "properties": {
     50         "attempted": {
     51           "type": "boolean",
     52           "description": "Whether reproduction was attempted."
     53         },
     54         "success": {
     55           "type": ["boolean", "null"],
     56           "description": "Whether results were reproduced within reasonable tolerance. Null if not attempted."
     57         },
     58         "details": {
     59           "type": "string",
     60           "description": "What was attempted, what matched, what diverged, and by how much."
     61         }
     62       },
     63       "$comment": "Enforces the documented contract that success is null when reproduction was not attempted. The reverse direction is intentionally left unconstrained.",
     64       "if": {
     65         "required": ["attempted"],
     66         "properties": {
     67           "attempted": { "const": false }
     68         }
     69       },
     70       "then": {
     71         "properties": {
     72           "success": { "type": "null" }
     73         }
     74       }
     75     },
     76     "benchmark_contamination_check": {
     77       "type": "object",
     78       "description": "Check for potential benchmark contamination or data leakage.",
     79       "required": ["checked", "concerns"],
     80       "properties": {
     81         "checked": {
     82           "type": "boolean",
     83           "description": "Whether contamination was checked."
     84         },
     85         "concerns": {
     86           "type": "string",
     87           "description": "Any contamination concerns found, or 'none' if clean."
     88         }
     89       }
     90     },
     91     "additional_findings": {
     92       "type": "array",
     93       "description": "Any other notable findings from deep evaluation.",
     94       "items": {
     95         "type": "object",
     96         "required": ["finding", "detail"],
     97         "properties": {
     98           "finding": {
     99             "type": "string",
    100             "description": "Short label for the finding."
    101           },
    102           "detail": {
    103             "type": "string",
    104             "description": "Detailed explanation."
    105           }
    106         }
    107       }
    108     }
    109   }
    110 }

Impressum · Datenschutz