transcript-analysis.sh (5019B)
#!/usr/bin/env bash
# Transcript analysis - measures agent efficiency from the conversation log.
# Extracts behavioral metrics from how the agent worked, not what it produced.
#
# Usage:  transcript-analysis.sh <run_dir>
# Input:  <run_dir>/transcript.jsonl (one JSON event per line)
# Output: JSON to stdout
#
# Requires jq, grep and awk on PATH.
#
# Strict mode (set -e / pipefail) is deliberately not enabled: jq errors on
# individual events are silenced so that a partially readable transcript
# still yields a report.

# Extended-regex patterns applied line by line to the Bash commands the agent ran.
DOC_PATTERNS='README|IMPLEMENTATION|FEATURES|QUICK_START|CHANGELOG|TODO|\.txt'
ASCII_ART_PATTERN='(cat <<|echo).*[═╔╗╚╝║▓░█✓✅🎮]'
SERVER_PATTERN='node server|npm start|npx serve|http-server|python.*http'
SELF_TEST_PATTERN='npm test|npx.*test|node.*test|tsc --noEmit|eslint'

#######################################
# Count stdin lines matching an extended regex.
# Always prints exactly one integer. `grep -c` already prints "0" (and exits 1)
# when nothing matches, so a `|| echo 0` fallback would emit "0" twice.
# Arguments: $1 - extended regular expression
# Outputs:   the match count on stdout
#######################################
count_matching() {
  local n
  n=$(grep -cE -- "$1" 2>/dev/null) || true
  printf '%s\n' "${n:-0}"
}

#######################################
# Turn waste / self-test counts into a score in [0.00, 1.00].
# Starts at 100, subtracts 5 per wasted turn (capped at 25), adds 10 when the
# agent self-tested, and caps the result at 100.
# Arguments: $1 - number of wasted turns
#            $2 - number of self-test commands
# Outputs:   the score with two decimals on stdout
#######################################
normalized_score() {
  local wasted=$1 self_tests=$2
  local penalty=$((wasted * 5))
  local score

  if ((penalty > 25)); then
    penalty=25
  fi
  score=$((100 - penalty))
  if ((self_tests > 0)); then
    score=$((score + 10))
  fi
  if ((score > 100)); then
    score=100
  fi
  # LC_ALL=C keeps "." as the decimal separator so the value stays valid JSON.
  LC_ALL=C awk -v s="$score" 'BEGIN { printf "%.2f", s / 100 }'
}

#######################################
# Analyse <run_dir>/transcript.jsonl and print the metrics as JSON.
# Arguments: $1 - run directory
# Outputs:   JSON report on stdout; an error object if no transcript exists
# Returns:   0 when the transcript is missing, otherwise the status of the
#            final jq invocation
#######################################
main() {
  local run_dir=${1:-}
  local transcript="$run_dir/transcript.jsonl"

  if [ ! -f "$transcript" ]; then
    echo '{"error": "no transcript found", "score": 0}'
    return 0
  fi

  # --- Count events ---
  # An empty pattern matches every line, including a final unterminated one.
  local total_events
  total_events=$(count_matching '' < "$transcript")

  # --- Extract the raw material once ---
  local tool_names bash_commands block_types
  tool_names=$(jq -r '
    select(.type == "assistant")
    | .message.content[]?
    | select(.type == "tool_use")
    | .name' "$transcript" 2>/dev/null)
  bash_commands=$(jq -r '
    select(.type == "assistant")
    | .message.content[]?
    | select(.type == "tool_use")
    | select(.name == "Bash")
    | .input.command // ""' "$transcript" 2>/dev/null)
  # One line per content block, so blocks are counted rather than the lines
  # of their pretty-printed JSON.
  block_types=$(jq -r '
    select(.type == "assistant")
    | .message.content[]?
    | .type? // empty' "$transcript" 2>/dev/null)

  # --- Count tool usage ---
  local total_tools bash_calls write_calls edit_calls read_calls
  total_tools=$(printf '%s\n' "$tool_names" | count_matching '.')
  bash_calls=$(printf '%s\n' "$tool_names" | count_matching '^Bash$')
  write_calls=$(printf '%s\n' "$tool_names" | count_matching '^Write$')
  edit_calls=$(printf '%s\n' "$tool_names" | count_matching '^Edit$')
  read_calls=$(printf '%s\n' "$tool_names" | count_matching '^Read$')

  # --- Detect wasted turns ---
  # NOTE: patterns are matched per line of each command, so a multi-line
  # command can contribute more than one hit.
  local wasted_writes ascii_art server_starts total_wasted
  # File writes that are documentation, not code
  wasted_writes=$(printf '%s\n' "$bash_commands" \
    | count_matching "cat >.*($DOC_PATTERNS)")
  # Turns spent printing ASCII art or decorative output
  ascii_art=$(printf '%s\n' "$bash_commands" | count_matching "$ASCII_ART_PATTERN")
  # Turns spent starting a server (unnecessary for static games)
  server_starts=$(printf '%s\n' "$bash_commands" | count_matching "$SERVER_PATTERN")
  total_wasted=$((wasted_writes + ascii_art + server_starts))

  # --- Detect error-fix cycles ---
  # One hit per tool result whose stderr has at least one non-empty line
  # (a multi-line stderr is a single error, not one per line).
  local errors
  errors=$(jq -r '
    select(.type == "user")
    | .tool_use_result
    | select(type == "object")
    | ((.stderr // "") | tostring)
    | select(split("\n") | any(. != ""))
    | "error"' "$transcript" 2>/dev/null | count_matching '.')

  # --- Thinking and text output blocks ---
  local thinking_blocks text_blocks
  thinking_blocks=$(printf '%s\n' "$block_types" | count_matching '^thinking$')
  text_blocks=$(printf '%s\n' "$block_types" | count_matching '^text$')

  # --- Productivity ratio ---
  local productive_tools productivity_ratio
  productive_tools=$((total_tools - total_wasted))
  # Per-line waste hits can exceed the number of tool calls; never go negative.
  if ((productive_tools < 0)); then
    productive_tools=0
  fi
  if ((total_tools > 0)); then
    productivity_ratio=$(LC_ALL=C awk -v p="$productive_tools" -v t="$total_tools" \
      'BEGIN { printf "%.2f", p / t }')
  else
    productivity_ratio="0"
  fi

  # --- Self-testing ---
  # Did the agent try to test its own code?
  local self_test
  self_test=$(printf '%s\n' "$bash_commands" | count_matching "$SELF_TEST_PATTERN")

  # --- Score ---
  # Low wasted turns = good, self-testing = bonus
  local score
  score=$(normalized_score "$total_wasted" "$self_test")

  # --- Build and print results ---
  jq -n \
    --argjson total_events "$total_events" \
    --argjson total_tools "$total_tools" \
    --argjson bash "$bash_calls" \
    --argjson write "$write_calls" \
    --argjson edit "$edit_calls" \
    --argjson read "$read_calls" \
    --argjson wasted "$total_wasted" \
    --argjson wasted_docs "$wasted_writes" \
    --argjson wasted_ascii "$ascii_art" \
    --argjson wasted_server "$server_starts" \
    --argjson errors "$errors" \
    --argjson thinking "$thinking_blocks" \
    --argjson text "$text_blocks" \
    --arg productivity "$productivity_ratio" \
    --argjson self_test "$self_test" \
    --argjson score "$score" \
    '{
      total_events: $total_events,
      tool_calls: {total: $total_tools, bash: $bash, write: $write, edit: $edit, read: $read},
      wasted_turns: {total: $wasted, docs: $wasted_docs, ascii_art: $wasted_ascii, server_starts: $wasted_server},
      errors_encountered: $errors,
      thinking_blocks: $thinking,
      text_blocks: $text,
      productivity_ratio: ($productivity | tonumber),
      self_tested: ($self_test > 0),
      score: $score
    }'
}

main "$@"