invoke.sh - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

invoke.sh (4466B)
      1 #!/usr/bin/env bash
      2 # Claude CLI invocation for benchmark runs.
      3 # Maps grid cell configuration to CLI flags and captures output.
      4 
      5 invoke_claude() {
      6   local cell_json="$1"
      7   local workspace="$2"
      8   local run_dir="$3"
      9   local project_dir="$4"
     10 
     11   local model effort task prompt_style human_language context_file
     12   local sub_agents web_search budget timeout_seconds base_tools
     13 
     14   model=$(echo "$cell_json" | jq -r '.model')
     15   effort=$(echo "$cell_json" | jq -r '.effort')
     16   task=$(echo "$cell_json" | jq -r '.task')
     17   prompt_style=$(echo "$cell_json" | jq -r '.prompt_style')
     18   human_language=$(echo "$cell_json" | jq -r '.human_language')
     19   context_file=$(echo "$cell_json" | jq -r '.context_file')
     20   sub_agents=$(echo "$cell_json" | jq -r '.sub_agents')
     21   web_search=$(echo "$cell_json" | jq -r '.web_search')
     22   budget=$(echo "$cell_json" | jq -r '.max_budget_usd')
     23   timeout_seconds=$(echo "$cell_json" | jq -r '.timeout_seconds // 600')
     24   base_tools=$(echo "$cell_json" | jq -r '.base_tools')
     25 
     26   # Select prompt file
     27   local prompt_file="$project_dir/tasks/$task/prompts/${prompt_style}.${human_language}.md"
     28   if [[ ! -f "$prompt_file" ]]; then
     29     echo "ERROR: Prompt file not found: $prompt_file" >&2
     30     return 1
     31   fi
     32   local prompt
     33   prompt=$(<"$prompt_file")
     34 
     35   # Append language instruction
     36   local language
     37   language=$(echo "$cell_json" | jq -r '.language')
     38   if [[ "$language" == "typescript" ]]; then
     39     prompt="$prompt
     40 
     41 Use TypeScript."
     42   elif [[ "$language" == "javascript" ]]; then
     43     prompt="$prompt
     44 
     45 Use JavaScript (no TypeScript)."
     46   fi
     47 
     48   # Build tool list from individual axes (Bash always on)
     49   local tools="Bash"
     50   local tool_read tool_write tool_edit tool_glob tool_grep
     51   tool_read=$(echo "$cell_json" | jq -r '.tool_read // "on"')
     52   tool_write=$(echo "$cell_json" | jq -r '.tool_write // "on"')
     53   tool_edit=$(echo "$cell_json" | jq -r '.tool_edit // "on"')
     54   tool_glob=$(echo "$cell_json" | jq -r '.tool_glob // "on"')
     55   tool_grep=$(echo "$cell_json" | jq -r '.tool_grep // "on"')
     56   [[ "$tool_read" == "on" ]] && tools="$tools,Read"
     57   [[ "$tool_write" == "on" ]] && tools="$tools,Write"
     58   [[ "$tool_edit" == "on" ]] && tools="$tools,Edit"
     59   [[ "$tool_glob" == "on" ]] && tools="$tools,Glob"
     60   [[ "$tool_grep" == "on" ]] && tools="$tools,Grep"
     61   if [[ "$sub_agents" == "on" ]]; then
     62     tools="$tools,Agent"
     63   fi
     64   if [[ "$web_search" == "on" ]]; then
     65     tools="$tools,WebSearch,WebFetch"
     66   fi
     67 
     68   # Build the claude command
     69   # --bare for full isolation (no CLAUDE.md, hooks, MCP, memory).
     70   # Auth via apiKeyHelper that reads OAuth token from ~/.claude/.credentials.json.
     71   local auth_helper
     72   auth_helper="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/get-oauth-token.sh"
     73 
     74   local cmd=(
     75     claude
     76     --bare
     77     -p "$prompt"
     78     --model "$model"
     79     --output-format stream-json
     80     --verbose
     81     --permission-mode dontAsk
     82     --max-budget-usd "$budget"
     83     --allowedTools "$tools"
     84     --settings "{\"apiKeyHelper\": \"$auth_helper\"}"
     85   )
     86 
     87   # Add effort level
     88   if [[ -n "$effort" ]] && [[ "$effort" != "null" ]]; then
     89     cmd+=(--effort "$effort")
     90   fi
     91 
     92   # Add context file as system prompt if provided
     93   if [[ "$context_file" == "provided" ]]; then
     94     local ctx_file="$project_dir/tasks/$task/context.md"
     95     if [[ -f "$ctx_file" ]]; then
     96       cmd+=(--append-system-prompt "$(cat "$ctx_file")")
     97     fi
     98   fi
     99 
    100   # Record start time
    101   local start_time
    102   start_time=$(date +%s)
    103 
    104   # Run claude in the workspace directory
    105   cd "$workspace" || exit 1
    106 
    107   local exit_code=0
    108   if timeout "${timeout_seconds}s" "${cmd[@]}" \
    109     > "$run_dir/transcript.jsonl" 2>"$run_dir/claude_stderr.log"; then
    110     exit_code=0
    111   else
    112     exit_code=$?
    113   fi
    114 
    115   local end_time
    116   end_time=$(date +%s)
    117   local wall_time=$((end_time - start_time))
    118 
    119   # Extract the final result message from the stream
    120   # The last JSON object with type "result" contains the summary metrics
    121   if [[ -f "$run_dir/transcript.jsonl" ]]; then
    122     tail -1 "$run_dir/transcript.jsonl" > "$run_dir/claude_output.json" 2>/dev/null || true
    123   fi
    124 
    125   # Update meta.json with timing info
    126   local meta_file="$run_dir/meta.json"
    127   if [[ -f "$meta_file" ]]; then
    128     local tmp
    129     tmp=$(jq \
    130       --argjson wall "$wall_time" \
    131       --argjson exit_code "$exit_code" \
    132       --arg completed "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    133       '. + {wall_time_seconds: $wall, exit_code: $exit_code, completed_at: $completed}' \
    134       "$meta_file")
    135     echo "$tmp" > "$meta_file"
    136   fi
    137 
    138   return $exit_code
    139 }
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README