transcript-analysis.sh - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

transcript-analysis.sh (5019B)
      1 #!/usr/bin/env bash
      2 # Transcript analysis - measures agent efficiency from the conversation log.
      3 # Extracts behavioral metrics from how the agent worked, not what it produced.
      4 #
      5 # Usage: transcript-analysis.sh <run_dir>
      6 # Output: JSON to stdout
      7 
      8 RUN_DIR="$1"
      9 TRANSCRIPT="$RUN_DIR/transcript.jsonl"
     10 
     11 if [ ! -f "$TRANSCRIPT" ]; then
     12   echo '{"error": "no transcript found", "score": 0}'
     13   exit 0
     14 fi
     15 
     16 # --- Count events by type ---
     17 total_events=$(wc -l < "$TRANSCRIPT")
     18 assistant_events=$(jq -r 'select(.type == "assistant")' "$TRANSCRIPT" 2>/dev/null | wc -l)
     19 
     20 # --- Count tool usage ---
     21 tool_calls=$(jq -r 'select(.type == "assistant") | .message.content[]? | select(.type == "tool_use") | .name' "$TRANSCRIPT" 2>/dev/null)
     22 total_tools=$(echo "$tool_calls" | grep -c . 2>/dev/null || echo "0")
     23 bash_calls=$(echo "$tool_calls" | grep -c "^Bash$" 2>/dev/null || echo "0")
     24 write_calls=$(echo "$tool_calls" | grep -c "^Write$" 2>/dev/null || echo "0")
     25 edit_calls=$(echo "$tool_calls" | grep -c "^Edit$" 2>/dev/null || echo "0")
     26 read_calls=$(echo "$tool_calls" | grep -c "^Read$" 2>/dev/null || echo "0")
     27 
     28 # --- Detect wasted turns ---
     29 # File writes that are documentation, not code
     30 wasted_writes=0
     31 doc_patterns='README|IMPLEMENTATION|FEATURES|QUICK_START|CHANGELOG|TODO|\.txt'
     32 wasted_writes=$(jq -r 'select(.type == "assistant") | .message.content[]? | select(.type == "tool_use") | select(.name == "Bash") | .input.command // ""' "$TRANSCRIPT" 2>/dev/null | grep -cE "cat >.*($doc_patterns)" 2>/dev/null || echo "0")
     33 
     34 # Turns spent printing ASCII art or decorative output
     35 ascii_art=$(jq -r 'select(.type == "assistant") | .message.content[]? | select(.type == "tool_use") | select(.name == "Bash") | .input.command // ""' "$TRANSCRIPT" 2>/dev/null | grep -cE '(cat <<|echo).*[═╔╗╚╝║▓░█✓✅🎮]' 2>/dev/null || echo "0")
     36 
     37 # Turns spent starting a server (unnecessary for static games)
     38 server_starts=$(jq -r 'select(.type == "assistant") | .message.content[]? | select(.type == "tool_use") | select(.name == "Bash") | .input.command // ""' "$TRANSCRIPT" 2>/dev/null | grep -cE 'node server|npm start|npx serve|http-server|python.*http' 2>/dev/null || echo "0")
     39 
     40 total_wasted=$((wasted_writes + ascii_art + server_starts))
     41 
     42 # --- Detect error-fix cycles ---
     43 # Count tool results with non-empty stderr or error indicators
     44 errors=$(jq -r 'select(.type == "user") | .tool_use_result | if type == "object" then (.stderr // "") else "" end' "$TRANSCRIPT" 2>/dev/null | grep -c . 2>/dev/null || echo "0")
     45 
     46 # --- Thinking blocks ---
     47 thinking_blocks=$(jq -r 'select(.type == "assistant") | .message.content[]? | select(.type == "thinking")' "$TRANSCRIPT" 2>/dev/null | wc -l)
     48 
     49 # --- Text output blocks ---
     50 text_blocks=$(jq -r 'select(.type == "assistant") | .message.content[]? | select(.type == "text")' "$TRANSCRIPT" 2>/dev/null | wc -l)
     51 
     52 # --- Productivity ratio ---
     53 productive_tools=$((total_tools - total_wasted))
     54 if [ "$total_tools" -gt 0 ]; then
     55   productivity_ratio=$(awk "BEGIN {printf \"%.2f\", $productive_tools / $total_tools}")
     56 else
     57   productivity_ratio="0"
     58 fi
     59 
     60 # --- Self-testing ---
     61 # Did the agent try to test its own code?
     62 self_test=$(jq -r 'select(.type == "assistant") | .message.content[]? | select(.type == "tool_use") | select(.name == "Bash") | .input.command // ""' "$TRANSCRIPT" 2>/dev/null | grep -cE 'npm test|npx.*test|node.*test|tsc --noEmit|eslint' 2>/dev/null || echo "0")
     63 
     64 # --- Build results ---
     65 results=$(jq -n \
     66   --argjson total_events "$total_events" \
     67   --argjson total_tools "$total_tools" \
     68   --argjson bash "$bash_calls" \
     69   --argjson write "$write_calls" \
     70   --argjson edit "$edit_calls" \
     71   --argjson read "$read_calls" \
     72   --argjson wasted "$total_wasted" \
     73   --argjson wasted_docs "$wasted_writes" \
     74   --argjson wasted_ascii "$ascii_art" \
     75   --argjson wasted_server "$server_starts" \
     76   --argjson errors "$errors" \
     77   --argjson thinking "$thinking_blocks" \
     78   --argjson text "$text_blocks" \
     79   --arg productivity "$productivity_ratio" \
     80   --argjson self_test "$self_test" \
     81   '{
     82     total_events: $total_events,
     83     tool_calls: {total: $total_tools, bash: $bash, write: $write, edit: $edit, read: $read},
     84     wasted_turns: {total: $wasted, docs: $wasted_docs, ascii_art: $wasted_ascii, server_starts: $wasted_server},
     85     errors_encountered: $errors,
     86     thinking_blocks: $thinking,
     87     text_blocks: $text,
     88     productivity_ratio: ($productivity | tonumber),
     89     self_tested: ($self_test > 0)
     90   }')
     91 
     92 # --- Score ---
     93 # High productivity ratio = good, low wasted turns = good, self-testing = bonus
     94 score=100
     95 
     96 # Penalty for wasted turns (5 points each, max 25)
     97 waste_penalty=$((total_wasted * 5))
     98 [ "$waste_penalty" -gt 25 ] && waste_penalty=25
     99 score=$((score - waste_penalty))
    100 
    101 # Bonus for self-testing
    102 if [ "$self_test" -gt 0 ]; then
    103   score=$((score + 10))
    104 fi
    105 [ "$score" -gt 100 ] && score=100
    106 
    107 score_normalized=$(awk "BEGIN {printf \"%.2f\", $score / 100}")
    108 
    109 results=$(echo "$results" | jq --argjson s "$score_normalized" '. + {score: $s}')
    110 
    111 echo "$results" | jq '.'
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README