types.ts - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

types.ts (2525B)
      1 /** Shared types for the dashboard. Client-safe -- no Node.js imports. */
      2 
      3 export interface RunMeta {
      4   run_id: string;
      5   cell_id: string;
      6   task: string;
      7   model: string;
      8   effort: string;
      9   prompt_style: string;
     10   language: string;
     11   human_language: string;
     12   tool_read: string;
     13   tool_write: string;
     14   tool_edit: string;
     15   tool_glob: string;
     16   tool_grep: string;
     17   linter: string;
     18   playwright: string;
     19   context_file: string;
     20   sub_agents?: string;
     21   web_search: string;
     22   tests_provided: string;
     23   strategy: string;
     24   design_guidance: string;
     25   architecture: string;
     26   error_checking: string;
     27   context_noise: string;
     28   renderer: string;
     29   provider: string;
     30   actual_model: string;
     31   short_id?: string;
     32   short_cell_id?: string;
     33   max_budget: string;
     34   max_budget_usd: number;
     35   run_number: number;
     36   wall_time_seconds?: number;
     37   exit_code?: number;
     38   started_at?: string;
     39   completed_at?: string;
     40   claude_version?: string;
     41 }
     42 
     43 export interface EvalResults {
     44   structural?: {
     45     pass: boolean;
     46     score: number;
     47     checks: Array<{ name: string; pass: boolean; detail: string }>;
     48   };
     49   functional?: {
     50     pass: boolean;
     51     score: number;
     52     total?: number;
     53     passed?: number;
     54     failed?: number;
     55   };
     56   quality?: {
     57     pass?: boolean;
     58     score: number;
     59     [key: string]: unknown;
     60   };
     61   score: number | null;
     62 }
     63 
     64 export interface ClaudeOutput {
     65   result?: string;
     66   total_cost_usd?: number;
     67   num_turns?: number;
     68   duration_ms?: number;
     69   usage?: {
     70     input_tokens?: number;
     71     output_tokens?: number;
     72   };
     73 }
     74 
     75 export interface Run {
     76   meta: RunMeta;
     77   eval_results: EvalResults | null;
     78   claude_output: ClaudeOutput | null;
     79   has_transcript: boolean;
     80 }
     81 
     82 export type AxisName = keyof Pick<
     83   RunMeta,
     84   | "model"
     85   | "effort"
     86   | "prompt_style"
     87   | "language"
     88   | "human_language"
     89   | "tool_read"
     90   | "tool_write"
     91   | "tool_edit"
     92   | "tool_glob"
     93   | "tool_grep"
     94   | "linter"
     95   | "playwright"
     96   | "context_file"
     97   | "web_search"
     98   | "max_budget"
     99   | "tests_provided"
    100   | "strategy"
    101   | "design_guidance"
    102   | "architecture"
    103   | "error_checking"
    104   | "context_noise"
    105   | "renderer"
    106   | "provider"
    107 >;
    108 
    109 export const AXIS_NAMES: AxisName[] = [
    110   "model",
    111   "effort",
    112   "prompt_style",
    113   "language",
    114   "human_language",
    115   "tool_read",
    116   "tool_write",
    117   "tool_edit",
    118   "tool_glob",
    119   "tool_grep",
    120   "linter",
    121   "playwright",
    122   "context_file",
    123   "web_search",
    124   "max_budget",
    125   "tests_provided",
    126   "strategy",
    127   "design_guidance",
    128   "architecture",
    129   "error_checking",
    130   "context_noise",
    131   "renderer",
    132   "provider",
    133 ];
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README