transcript-analysis.py - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

transcript-analysis.py (3853B)
      1 #!/usr/bin/env python3
      2 """Transcript analysis - measures agent efficiency from the conversation log."""
      3 
      4 import json
      5 import re
      6 import sys
      7 from pathlib import Path
      8 
      9 
     10 def main():
     11     run_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
     12     transcript = run_dir / "transcript.jsonl"
     13 
     14     if not transcript.exists():
     15         print(json.dumps({"error": "no transcript found", "score": 0}))
     16         return
     17 
     18     events = []
     19     for line in transcript.read_text().strip().split("\n"):
     20         if line.strip():
     21             try:
     22                 events.append(json.loads(line))
     23             except json.JSONDecodeError:
     24                 pass
     25 
     26     # Count tool calls by type
     27     tool_calls = []
     28     for ev in events:
     29         if ev.get("type") == "assistant" and ev.get("message", {}).get("content"):
     30             for block in ev["message"]["content"]:
     31                 if block.get("type") == "tool_use":
     32                     tool_calls.append(block)
     33 
     34     tool_names = [t.get("name", "") for t in tool_calls]
     35     bash_commands = []
     36     for t in tool_calls:
     37         if t.get("name") == "Bash":
     38             cmd = t.get("input", {}).get("command", "")
     39             bash_commands.append(cmd)
     40 
     41     # Count wasted turns
     42     doc_patterns = re.compile(r"cat >.*?(README|IMPLEMENTATION|FEATURES|QUICK_START|CHANGELOG|TODO|\.txt)", re.I)
     43     ascii_patterns = re.compile(r"(cat <<|echo).*[═╔╗╚╝║▓░█✓✅🎮]")
     44     server_patterns = re.compile(r"node server|npm start|npx serve|http-server|python.*http")
     45     test_patterns = re.compile(r"npm test|npx.*test|node.*test|tsc --noEmit|eslint")
     46 
     47     wasted_docs = sum(1 for c in bash_commands if doc_patterns.search(c))
     48     wasted_ascii = sum(1 for c in bash_commands if ascii_patterns.search(c))
     49     wasted_server = sum(1 for c in bash_commands if server_patterns.search(c))
     50     self_tested = sum(1 for c in bash_commands if test_patterns.search(c))
     51 
     52     total_wasted = wasted_docs + wasted_ascii + wasted_server
     53 
     54     # Count errors in tool results
     55     errors = 0
     56     for ev in events:
     57         if ev.get("type") == "user":
     58             result = ev.get("tool_use_result")
     59             if isinstance(result, dict) and result.get("stderr"):
     60                 errors += 1
     61 
     62     # Count thinking and text blocks
     63     thinking_blocks = 0
     64     text_blocks = 0
     65     for ev in events:
     66         if ev.get("type") == "assistant" and ev.get("message", {}).get("content"):
     67             for block in ev["message"]["content"]:
     68                 if block.get("type") == "thinking":
     69                     thinking_blocks += 1
     70                 elif block.get("type") == "text":
     71                     text_blocks += 1
     72 
     73     # Productivity ratio
     74     total_tools = len(tool_calls)
     75     productive = total_tools - total_wasted
     76     productivity_ratio = round(productive / total_tools, 2) if total_tools > 0 else 0
     77 
     78     # Score
     79     score = 100
     80     waste_penalty = min(total_wasted * 5, 25)
     81     score -= waste_penalty
     82     if self_tested > 0:
     83         score = min(score + 10, 100)
     84     score_normalized = round(score / 100, 2)
     85 
     86     result = {
     87         "total_events": len(events),
     88         "tool_calls": {
     89             "total": total_tools,
     90             "bash": tool_names.count("Bash"),
     91             "write": tool_names.count("Write"),
     92             "edit": tool_names.count("Edit"),
     93             "read": tool_names.count("Read"),
     94         },
     95         "wasted_turns": {
     96             "total": total_wasted,
     97             "docs": wasted_docs,
     98             "ascii_art": wasted_ascii,
     99             "server_starts": wasted_server,
    100         },
    101         "errors_encountered": errors,
    102         "thinking_blocks": thinking_blocks,
    103         "text_blocks": text_blocks,
    104         "productivity_ratio": productivity_ratio,
    105         "self_tested": self_tested > 0,
    106         "score": score_normalized,
    107     }
    108 
    109     print(json.dumps(result, indent=2))
    110 
    111 
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README