{"id":"d801f2af-7547-4d38-907b-258ddcb0fb1a","shortId":"rNRxsY","kind":"skill","title":"Benchmark virtual agents with scripted multi-turn conversations using Agent Evaluation","tagline":"Run concurrent scripted conversations against a target agent to measure whether it stays on task, responds correctly, and holds up in repeatable test cases.","description":"# Benchmark virtual agents with scripted multi-turn conversations using Agent Evaluation\n\nRun concurrent scripted conversations against a target agent to measure whether it stays on task, responds correctly, and holds up in repeatable test cases.\n\n## Prerequisites\n\nPython environment, target agent endpoint or integration, optional AWS services such as Bedrock or SageMaker\n\n## Installation\n\nNo source-backed install or usage instructions could be extracted automatically. Review the upstream project before running this skill in a sensitive workflow.\n\n- Source: https://github.com/awslabs/agent-evaluation\n\n## Documentation\n\n- https://awslabs.github.io/agent-evaluation/\n\n## Source\n\n- [Agent Skill Exchange](https://agentskillexchange.com/skills/benchmark-virtual-agents-with-scripted-multi-turn-conversations-using-agent-evaluation/)","tags":["benchmark","virtual","agents","with","scripted","multi","turn","conversations","using","agent","evaluation","skills"],"capabilities":["skill","source-agentskillexchange","skill-benchmark-virtual-agents-with-scripted-multi-turn-conversations-using-agent-evaluation","topic-agent-skills","topic-ai-agents","topic-ai-tools","topic-awesome-list","topic-claude-code","topic-codex","topic-cursor","topic-llm","topic-mcp","topic-npx-skills","topic-openclaw","topic-skills-catalog"],"categories":["skills"],"synonyms":[],"warnings":[],"endpointUrl":"https://skills.sh/agentskillexchange/skills/benchmark-virtual-agents-with-scripted-multi-turn-conversations-using-agent-evaluation","protocol":"skill","transport":"skills-sh","auth":{"type":"none","details":{"cli":"npx skills add agentskillexchange/skills","source_repo":"https://github.com/agentskillexchange/skills","install_from":"skills.sh"}},"qualityScore":"0.454","qualityRationale":"deterministic score 0.45 from registry signals: · indexed on github topic:agent-skills · 8 github stars · SKILL.md body (836 chars)","verified":false,"liveness":"unknown","lastLivenessCheck":null,"agentReviews":{"count":0,"score_avg":null,"cost_usd_avg":null,"success_rate":null,"latency_p50_ms":null,"narrative_summary":null,"summary_updated_at":null},"enrichmentModel":"deterministic:skill-github:v1","enrichmentVersion":1,"enrichedAt":"2026-05-18T19:09:37.266Z","embedding":null,"createdAt":"2026-05-18T13:15:24.703Z","updatedAt":"2026-05-18T19:09:37.266Z","lastSeenAt":"2026-05-18T19:09:37.266Z","tsv":"'/agent-evaluation/':121 '/awslabs/agent-evaluation':117 '/skills/benchmark-virtual-agents-with-scripted-multi-turn-conversations-using-agent-evaluation/)':128 'agent':3,11,20,39,47,56,77,123 'agentskillexchange.com':127 'agentskillexchange.com/skills/benchmark-virtual-agents-with-scripted-multi-turn-conversations-using-agent-evaluation/)':126 'automat':101 'aw':82 'awslabs.github.io':120 'awslabs.github.io/agent-evaluation/':119 'back':93 'bedrock':86 'benchmark':1,37 'case':36,72 'concurr':14,50 'convers':9,16,45,52 'correct':29,65 'could':98 'document':118 'endpoint':78 'environ':75 'evalu':12,48 'exchang':125 'extract':100 'github.com':116 'github.com/awslabs/agent-evaluation':115 'hold':31,67 'instal':89,94 'instruct':97 'integr':80 'measur':22,58 'multi':7,43 'multi-turn':6,42 'option':81 'prerequisit':73 'project':105 'python':74 'repeat':34,70 'respond':28,64 'review':102 'run':13,49,107 'sagemak':88 'script':5,15,41,51 'sensit':112 'servic':83 'skill':109,124 'skill-benchmark-virtual-agents-with-scripted-multi-turn-conversations-using-agent-evaluation' 'sourc':92,114,122 'source-agentskillexchange' 'source-back':91 'stay':25,61 'target':19,55,76 'task':27,63 'test':35,71 'topic-agent-skills' 'topic-ai-agents' 'topic-ai-tools' 'topic-awesome-list' 'topic-claude-code' 'topic-codex' 'topic-cursor' 'topic-llm' 'topic-mcp' 'topic-npx-skills' 'topic-openclaw' 'topic-skills-catalog' 'turn':8,44 'upstream':104 'usag':96 'use':10,46 'virtual':2,38 'whether':23,59 'workflow':113","prices":[{"id":"45623f1f-2ad5-4247-8b85-48486a7687d5","listingId":"d801f2af-7547-4d38-907b-258ddcb0fb1a","amountUsd":"0","unit":"free","nativeCurrency":null,"nativeAmount":null,"chain":null,"payTo":null,"paymentMethod":"skill-free","isPrimary":true,"details":{"org":"agentskillexchange","category":"skills","install_from":"skills.sh"},"createdAt":"2026-05-18T13:15:24.703Z"}],"sources":[{"listingId":"d801f2af-7547-4d38-907b-258ddcb0fb1a","source":"github","sourceId":"agentskillexchange/skills/benchmark-virtual-agents-with-scripted-multi-turn-conversations-using-agent-evaluation","sourceUrl":"https://github.com/agentskillexchange/skills/tree/main/skills/benchmark-virtual-agents-with-scripted-multi-turn-conversations-using-agent-evaluation","isPrimary":false,"firstSeenAt":"2026-05-18T13:15:24.703Z","lastSeenAt":"2026-05-18T19:09:37.266Z"}],"details":{"listingId":"d801f2af-7547-4d38-907b-258ddcb0fb1a","quickStartSnippet":null,"exampleRequest":null,"exampleResponse":null,"schema":null,"openapiUrl":null,"agentsTxtUrl":null,"citations":[],"useCases":[],"bestFor":[],"notFor":[],"kindDetails":{"org":"agentskillexchange","slug":"benchmark-virtual-agents-with-scripted-multi-turn-conversations-using-agent-evaluation","github":{"repo":"agentskillexchange/skills","stars":8,"topics":["agent-skills","ai-agents","ai-tools","awesome-list","claude-code","codex","cursor","llm","mcp","npx-skills","openclaw","skills-catalog"],"license":"mit","html_url":"https://github.com/agentskillexchange/skills","pushed_at":"2026-05-18T19:02:17Z","description":"The open catalog of AI agent skills — 2,000+ security-scanned skills for Claude Code, Cursor, Codex, and more.","skill_md_sha":"d483583ea40a4d32d8e5da6cde195fd169f704fe","skill_md_path":"skills/benchmark-virtual-agents-with-scripted-multi-turn-conversations-using-agent-evaluation/SKILL.md","default_branch":"main","skill_tree_url":"https://github.com/agentskillexchange/skills/tree/main/skills/benchmark-virtual-agents-with-scripted-multi-turn-conversations-using-agent-evaluation"},"layout":"multi","source":"github","category":"skills","frontmatter":{"name":"Benchmark virtual agents with scripted multi-turn conversations using Agent Evaluation","description":"Run concurrent scripted conversations against a target agent to measure whether it stays on task, responds correctly, and holds up in repeatable test cases."},"skills_sh_url":"https://skills.sh/agentskillexchange/skills/benchmark-virtual-agents-with-scripted-multi-turn-conversations-using-agent-evaluation"},"updatedAt":"2026-05-18T19:09:37.266Z"}}