/**
 * Phase 0 PoC: end-to-end validation of the Visual Agent.
 *
 * Goal: verify that the chain
 *   "screenshot -> Volcano Engine VLM -> parse -> nut.js paste -> send"
 * works inside the WeChat 4.x "文件传输助手" (File Transfer) chat, run it N times
 * and report the success rate.
 *
 * The script does not depend on the neta backend. All third-party packages are
 * loaded directly from the backend's node_modules directory.
 *
 * Credentials: ARK_API_KEY MUST be provided through the environment (no key is
 * embedded in this file). ARK_BASE_URL and MODEL_NAME are optional overrides.
 *
 * Usage:
 *   1) Open PC WeChat, log in, and open the "文件传输助手" chat (main window
 *      visible, not minimized).
 *   2) From the repository root run:
 *        pnpm --filter @neta/backend exec tsx ../../tools/visual_agent_probe/run-once.ts [N]
 *      N = number of runs (default 3; a full gate check should use 20).
 *   3) Wait for it to finish; do not touch keyboard or mouse meanwhile.
 *   4) Afterwards check that the chat received N "[probe-i-ts]" messages.
 *   5) The console prints per-run results plus the overall success rate.
 *
 * Note: never switch windows or touch mouse/keyboard while it runs. The script
 * waits 3 seconds between messages.
 */

/* eslint-disable @typescript-eslint/no-explicit-any */

import * as fs from 'node:fs';
import * as path from 'node:path';

// Repository root (this script lives under tools/visual_agent_probe/).
const REPO_ROOT = path.resolve(__dirname, '..', '..');

// Let require() resolve dependencies from the backend node_modules.
require('module').Module._initPaths();
const backendNodeModules = path.join(REPO_ROOT, 'packages', 'backend', 'node_modules');
if (fs.existsSync(backendNodeModules)) {
  (require('module').Module as any).globalPaths.unshift(backendNodeModules);
}

// ===== Configuration =====
const ARGS = process.argv.slice(2);
const DEFAULT_RUNS = 3;

// Volcano Engine (Ark) credentials. The API key is read from the environment
// only; a secret must never be committed as a fallback value.
const ARK_API_KEY = process.env.ARK_API_KEY || '';
const ARK_BASE_URL = process.env.ARK_BASE_URL || 'https://ark.cn-beijing.volces.com/api/v3';
const MODEL_NAME = process.env.MODEL_NAME || 'doubao-seed-2-0-pro-260215';

const TARGET_ROOM = '文件传输助手';

const RULE = '========================================';

/**
 * Parses the optional run-count CLI argument.
 *
 * @param arg Raw first CLI argument, if any.
 * @returns The requested number of runs, or DEFAULT_RUNS when the argument is absent/blank.
 * @throws Error when the argument is not a positive integer (0, negatives and
 *         non-numeric input would otherwise produce an empty loop and NaN statistics).
 */
function parseRunCount(arg: string | undefined): number {
  if (arg === undefined || arg.trim() === '') return DEFAULT_RUNS;
  const n = Number(arg);
  if (!Number.isInteger(n) || n < 1) {
    throw new Error(`invalid run count "${arg}": expected a positive integer`);
  }
  return n;
}

/** Extracts a printable message from an arbitrary thrown value. */
function errorMessage(e: unknown): string {
  if (typeof e === 'object' && e !== null && 'message' in e) {
    const m = (e as { message: unknown }).message;
    if (typeof m === 'string' && m) return m;
  }
  return String(e);
}

// ===== Win32 API via koffi =====
let koffi: any;
let user32: any;
let kernel32: any;

/** Loads koffi from the backend node_modules and opens user32.dll / kernel32.dll. */
function loadKoffi() {
  const koffiPath = path.join(backendNodeModules, 'koffi');
  koffi = require(koffiPath);
  user32 = koffi.load('user32.dll');
  kernel32 = koffi.load('kernel32.dll');
}

/**
 * Requests per-monitor DPI awareness for this process (Windows only).
 * Failures are logged as a warning and otherwise ignored (best effort).
 */
function ensureDpiAware() {
  if (process.platform !== 'win32') return;
  try {
    const SetProcessDpiAwarenessContext = user32.func('SetProcessDpiAwarenessContext', 'bool', ['intptr']);
    // DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2 = -4
    SetProcessDpiAwarenessContext(-4);
  } catch (e: unknown) {
    console.warn(' [dpi] SetProcessDpiAwarenessContext failed:', errorMessage(e));
  }
}

interface WinHandle {
  hwnd: number;
  pid: number;
  title: string;
  bounds: { x: number; y: number; width: number; height: number };
  // Reference to the node-screenshots Window instance, used for capturing.
  nsWindow: any;
}

/**
 * Locates the WeChat main window.
 *
 * Enumerates windows via node-screenshots and picks the largest non-minimized
 * window whose appName is 'Weixin' or 'WeChat'.
 *
 * @returns A handle describing the window, or null when none is found or the
 *          platform is not Windows.
 */
function findWeixinWindow(): WinHandle | null {
  if (process.platform !== 'win32') return null;
  const ssPath = path.join(backendNodeModules, 'node-screenshots');
  const screenshots = require(ssPath);
  const wins = screenshots.Window.all();

  let best: any = null;
  let bestArea = 0;
  for (const w of wins) {
    try {
      const app = w.appName();
      if (app !== 'Weixin' && app !== 'WeChat') continue;
      if (w.isMinimized()) continue;
      const area = w.width() * w.height();
      if (area > bestArea) {
        bestArea = area;
        best = w;
      }
    } catch {
      // Deliberate best effort: a window that cannot be queried is skipped.
    }
  }
  if (!best) return null;

  return {
    hwnd: best.id(),
    pid: best.pid(),
    title: best.title(),
    bounds: { x: best.x(), y: best.y(), width: best.width(), height: best.height() },
    nsWindow: best,
  };
}

/**
 * Restores (if minimized) and brings the given window to the foreground.
 *
 * @param hwnd Native window handle.
 */
function activateWindow(hwnd: number) {
  const SetForegroundWindow = user32.func('SetForegroundWindow', 'bool', ['intptr']);
  const ShowWindow = user32.func('ShowWindow', 'bool', ['intptr', 'int']);
  const BringWindowToTop = user32.func('BringWindowToTop', 'bool', ['intptr']);
  const IsIconic = user32.func('IsIconic', 'bool', ['intptr']);
  if (IsIconic(hwnd)) {
    ShowWindow(hwnd, 9); // SW_RESTORE
  }
  BringWindowToTop(hwnd);
  SetForegroundWindow(hwnd);
}

// ===== Screenshot =====

/**
 * Captures the WeChat window as PNG bytes.
 *
 * The window is re-enumerated before every capture; per the original author's
 * note, reusing a node-screenshots Window object returns a cached, stale frame.
 *
 * @param _winIgnored Unused; kept for call-site compatibility.
 * @throws Error when the WeChat window can no longer be found.
 */
async function captureWindowPng(_winIgnored: WinHandle): Promise<Buffer> {
  const fresh = findWeixinWindow();
  if (!fresh) throw new Error('weixin window vanished');
  const image = fresh.nsWindow.captureImageSync();
  return Buffer.from(image.toPngSync());
}

/**
 * Captures the WeChat window and also writes the PNG into the debug directory.
 *
 * @param win Window handle forwarded to captureWindowPng.
 * @param fileName File name (not path) of the debug PNG.
 * @returns The captured PNG bytes.
 */
async function captureAndSaveDebug(win: WinHandle, fileName: string): Promise<Buffer> {
  const png = await captureWindowPng(win);
  const debugDir = path.join(REPO_ROOT, 'tools', 'visual_agent_probe', 'debug');
  fs.mkdirSync(debugDir, { recursive: true });
  fs.writeFileSync(path.join(debugDir, fileName), png);
  return png;
}

// ===== Clipboard (Windows clip.exe, avoiding the clipboardy v5 ESM problem) =====

/**
 * Puts text on the Windows clipboard by piping it into clip.exe.
 *
 * @param text Text to copy.
 * @throws Error when clip.exe cannot be spawned or exits with a non-zero status.
 */
function clipboardWrite(text: string): void {
  const { spawnSync } = require('node:child_process');
  // clip.exe reads stdin; a UTF-16 LE BOM is prepended so non-ASCII text is handled correctly.
  const buf = Buffer.concat([
    Buffer.from([0xff, 0xfe]),
    Buffer.from(text, 'utf16le'),
  ]);
  const r = spawnSync('clip.exe', [], { input: buf });
  if (r.status !== 0) {
    // r.error is set when the process could not be spawned at all (stderr is then empty).
    const detail = r.stderr?.toString() || (r.error ? errorMessage(r.error) : '');
    throw new Error('clip.exe failed: ' + detail);
  }
}

// ===== Keyboard / mouse (nut.js) =====

interface InputBundle {
  nut: any;
  clipboardyWrite: (text: string) => void;
}

/** Loads nut.js from the backend node_modules and shortens its auto delays to 30 ms. */
async function loadInput(): Promise<InputBundle> {
  // Loaded explicitly from the backend node_modules (nut.js is CJS, so require works).
  const nutPath = path.join(backendNodeModules, '@nut-tree-fork', 'nut-js');
  const nut = require(nutPath);
  nut.keyboard.config.autoDelayMs = 30;
  nut.mouse.config.autoDelayMs = 30;
  return { nut, clipboardyWrite: clipboardWrite };
}

/** Sends Ctrl+V; Ctrl is always released, even if pressing/releasing V throws. */
async function pressPaste(nut: any): Promise<void> {
  await nut.keyboard.pressKey(nut.Key.LeftControl);
  try {
    await nut.keyboard.pressKey(nut.Key.V);
    await nut.keyboard.releaseKey(nut.Key.V);
  } finally {
    await nut.keyboard.releaseKey(nut.Key.LeftControl);
  }
}

const sleep = (ms: number) => new Promise<void>(r => setTimeout(r, ms));
const jitter = (min: number, max: number) => Math.floor(min + Math.random() * (max - min));

// ===== VLM call =====

interface VlmAction {
  type: string;
  x?: number;
  y?: number;
  key?: string;
  text?: string;
  reason?: string;
}

interface VlmResponse {
  raw: string;
  action: VlmAction;
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
  latencyMs: number;
}

/**
 * Converts the model's raw text reply into an action object.
 *
 * Optional markdown code fences are stripped before parsing. Anything that is
 * not a JSON object with a string `type` becomes a `failed` action instead of
 * being trusted, so callers can safely read `action.type`.
 */
function parseVlmAction(raw: string): VlmAction {
  const cleaned = raw.replace(/^```(?:json)?\s*|\s*```$/g, '').trim();
  let parsed: unknown;
  try {
    parsed = JSON.parse(cleaned);
  } catch {
    return { type: 'failed', reason: 'json-parse-failed: ' + raw.slice(0, 200) };
  }
  if (
    typeof parsed !== 'object' ||
    parsed === null ||
    typeof (parsed as { type?: unknown }).type !== 'string'
  ) {
    return { type: 'failed', reason: 'invalid-action-shape: ' + raw.slice(0, 200) };
  }
  return parsed as VlmAction;
}

/**
 * Sends one screenshot plus a task description to the vision model and parses
 * the single-action JSON reply.
 *
 * @param screenshot PNG bytes of the WeChat window.
 * @param task Task text embedded into the system prompt.
 * @returns Raw reply, parsed action, token usage and request latency.
 */
async function callVlm(screenshot: Buffer, task: string): Promise<VlmResponse> {
  const oaiPath = path.join(backendNodeModules, 'openai');
  const OpenAIMod = require(oaiPath);
  const OpenAI = OpenAIMod.default || OpenAIMod;
  const client = new OpenAI({ apiKey: ARK_API_KEY, baseURL: ARK_BASE_URL });

  const b64 = screenshot.toString('base64');
  const t0 = Date.now();

  // Prompt text is runtime data sent to the model and is intentionally kept in Chinese.
  const systemPrompt = [
    '你是一个 Windows 桌面操作助手。',
    '看到的是 PC 微信 4.x 主窗口截图。',
    `你的任务: ${task}`,
    '输出严格 JSON, 不要任何解释或代码块标记:',
    '{"type":"click","x":<int>,"y":<int>,"reason":"<简短理由>"}',
    '或 {"type":"hotkey","key":"ctrl+v"|"enter"|"ctrl+f"|"ctrl+1","reason":"..."}',
    '或 {"type":"finished","reason":"..."}',
    '或 {"type":"failed","reason":"为何无法完成"}',
    '只输出一个动作 JSON。click 坐标必须在截图范围内, 以截图左上角为原点。',
  ].join(' ');

  const resp = await client.chat.completions.create({
    model: MODEL_NAME,
    messages: [
      { role: 'system', content: systemPrompt },
      {
        role: 'user',
        content: [
          { type: 'text', text: '请给出下一步动作的 JSON。' },
          { type: 'image_url', image_url: { url: 'data:image/png;base64,' + b64 } },
        ],
      },
    ],
    temperature: 0.2,
    max_tokens: 400,
  });

  const latencyMs = Date.now() - t0;
  const raw: string = resp.choices?.[0]?.message?.content?.trim() || '';

  return {
    raw,
    action: parseVlmAction(raw),
    promptTokens: resp.usage?.prompt_tokens || 0,
    completionTokens: resp.usage?.completion_tokens || 0,
    totalTokens: resp.usage?.total_tokens || 0,
    latencyMs,
  };
}

// ===== Single run =====

type RunStatus =
  | 'success'
  | 'window-missing'
  | 'verify-failed'
  | 'model-failed'
  | 'action-failed'
  | 'exception';

interface TraceEntry {
  step: number;
  action: VlmAction;
  raw?: string;
}

interface RunResult {
  index: number;
  text: string;
  status: RunStatus;
  steps: number;
  modelCalls: number;
  totalTokens: number;
  durationMs: number;
  error?: string;
  trace: TraceEntry[];
}

/**
 * Executes one probe iteration: verify the current chat via the VLM, paste the
 * probe text, press Enter, then ask the VLM whether the message is visible.
 *
 * Phase 0 simplification: the target chat is assumed to be already open. No
 * Ctrl+F navigation is attempted because, per the original author's note, the
 * first global-search hit in WeChat 4.x is often an official account rather
 * than the target chat; navigation is deferred to Phase A.
 *
 * @param index 1-based run number (embedded in the probe text).
 * @param input nut.js handle plus clipboard writer from loadInput().
 * @param ts Timestamp shared by all runs of this invocation (embedded in the probe text).
 * @returns The outcome of the run; expected failures are reported via `status`, not thrown.
 */
async function runOnce(index: number, input: InputBundle, ts: number): Promise<RunResult> {
  const text = `[probe-${index}-${ts}]`;
  const trace: TraceEntry[] = [];
  let modelCalls = 0;
  let totalTokens = 0;
  const t0 = Date.now();

  // Builds the result from the counters as they stand at the time of the call.
  const finish = (status: RunStatus, error?: string): RunResult => ({
    index,
    text,
    status,
    steps: trace.length,
    modelCalls,
    totalTokens,
    durationMs: Date.now() - t0,
    trace,
    ...(error !== undefined ? { error } : {}),
  });

  const win = findWeixinWindow();
  if (!win) return finish('window-missing', 'weixin not running');
  console.log(` win: hwnd=${win.hwnd} title=${win.title} bounds=${JSON.stringify(win.bounds)}`);

  activateWindow(win.hwnd);
  await sleep(600);

  // ---------- Step 4: VLM confirms the current chat is the target ----------
  let shot1: Buffer;
  try {
    // The screenshot is also written to disk for debugging.
    shot1 = await captureAndSaveDebug(win, `run-${index}-${ts}-step4-verify-in-chat.png`);
  } catch (e: unknown) {
    return finish('exception', 'capture failed: ' + errorMessage(e));
  }

  let vlm: VlmResponse;
  try {
    vlm = await callVlm(shot1, `仔细看截图。微信主窗口右侧是当前打开的聊天对话,顶部应该有该聊天的名字。判断当前打开的聊天是否就是"${TARGET_ROOM}"(看右侧上方的标题文字)。如果是, 输出 {"type":"finished","reason":"current chat is ${TARGET_ROOM}"}。如果不是 (打开的是其他聊天 / 公众号面板 / 通讯录), 输出 {"type":"failed","reason":"当前聊天是XX, 不是${TARGET_ROOM}"}。`);
    modelCalls++;
    totalTokens += vlm.totalTokens;
    trace.push({ step: 4, action: vlm.action, raw: vlm.raw });
    console.log(` vlm-1 (verify-in-chat) action=${JSON.stringify(vlm.action)} tokens=${vlm.totalTokens} latency=${vlm.latencyMs}ms`);
  } catch (e: unknown) {
    return finish('model-failed', errorMessage(e));
  }

  if (vlm.action.type !== 'finished') {
    return finish('verify-failed', 'not in target chat: ' + vlm.action.reason);
  }

  // ---------- Step 5/6: paste + Enter ----------
  try {
    input.clipboardyWrite(text);
    await sleep(150);
    await pressPaste(input.nut);
    await sleep(jitter(300, 600));
    trace.push({ step: 5, action: { type: 'type', text } });

    await input.nut.keyboard.pressKey(input.nut.Key.Enter);
    await input.nut.keyboard.releaseKey(input.nut.Key.Enter);
    await sleep(jitter(800, 1300));
    trace.push({ step: 6, action: { type: 'hotkey', key: 'enter' } });
  } catch (e: unknown) {
    // Reported here (not by the caller) so trace, token counts and duration are preserved.
    return finish('action-failed', 'input failed: ' + errorMessage(e));
  }

  // ---------- Step 7: VLM confirms the message was sent ----------
  let shot2: Buffer;
  try {
    shot2 = await captureAndSaveDebug(win, `run-${index}-${ts}-step7-verify-sent.png`);
  } catch (e: unknown) {
    return finish('exception', 'verify capture failed: ' + errorMessage(e));
  }

  let vlm2: VlmResponse;
  try {
    vlm2 = await callVlm(shot2, `仔细看截图。微信聊天窗口右侧底部是最新消息。判断最新一条消息(气泡显示在屏幕右侧的, 即己方发出的)的文本是否包含 "${text}" (注意是字面字符串, 不要看意思)。如果包含, 输出 {"type":"finished","reason":"message visible at bottom"}。否则输出 {"type":"failed","reason":"最底部最新消息内容: <实际看到的文本>"}。`);
    modelCalls++;
    totalTokens += vlm2.totalTokens;
    trace.push({ step: 7, action: vlm2.action, raw: vlm2.raw });
    console.log(` vlm-2 (verify-sent) action=${JSON.stringify(vlm2.action)} tokens=${vlm2.totalTokens} latency=${vlm2.latencyMs}ms`);
  } catch (e: unknown) {
    return finish('model-failed', 'verify-sent model err: ' + errorMessage(e));
  }

  if (vlm2.action.type === 'finished') return finish('success');
  return finish('verify-failed', 'message not visible in chat: ' + vlm2.action.reason);
}

// ===== main =====

/**
 * Entry point: validates configuration, runs the probe N times with a 3 s
 * cooldown between runs, prints a summary and writes the raw JSON report.
 */
async function main() {
  const N_RUNS = parseRunCount(ARGS[0]);

  if (!ARK_API_KEY) {
    console.error('ARK_API_KEY is not set. Provide it via the environment before running the probe.');
    process.exit(1);
  }

  console.log(RULE);
  console.log('Visual Agent Phase 0 PoC');
  console.log(` model: ${MODEL_NAME}`);
  console.log(` baseUrl: ${ARK_BASE_URL}`);
  // Only a short prefix is logged so the secret does not end up in console logs.
  console.log(` apiKey: ${ARK_API_KEY.slice(0, 4)}...`);
  console.log(` N runs: ${N_RUNS}`);
  console.log(` target: ${TARGET_ROOM}`);
  console.log(RULE);

  if (process.platform !== 'win32') {
    console.error('This probe only runs on Windows.');
    process.exit(1);
  }

  loadKoffi();
  ensureDpiAware();
  console.log('DPI Aware set.');

  const input = await loadInput();
  console.log('nut.js + clipboardy loaded.');

  const win = findWeixinWindow();
  if (!win) {
    console.error('WeChat (Weixin / 微信) window not found. Open it and try again.');
    process.exit(1);
  }
  console.log(`WeChat window found. hwnd=${win.hwnd} title=${win.title} bounds=${JSON.stringify(win.bounds)}`);

  console.log('\nStarting in 8 seconds. NOW switch to WeChat and open 文件传输助手. DO NOT touch keyboard/mouse after.\n');
  await sleep(8000);

  const startedAt = Date.now();
  const ts = Date.now();
  const results: RunResult[] = [];

  for (let i = 1; i <= N_RUNS; i++) {
    console.log(`\n--- Run ${i}/${N_RUNS} ---`);
    let r: RunResult;
    try {
      r = await runOnce(i, input, ts);
    } catch (e: unknown) {
      // Safety net for anything runOnce did not classify itself.
      r = {
        index: i,
        text: `[probe-${i}-${ts}]`,
        status: 'exception',
        steps: 0,
        modelCalls: 0,
        totalTokens: 0,
        durationMs: 0,
        trace: [],
        error: errorMessage(e),
      };
    }
    results.push(r);
    console.log(` result: status=${r.status} steps=${r.steps} modelCalls=${r.modelCalls} tokens=${r.totalTokens} dur=${r.durationMs}ms${r.error ? ' err=' + r.error : ''}`);
    if (i < N_RUNS) {
      console.log(' cooldown 3s ...');
      await sleep(3000);
    }
  }

  const elapsed = Date.now() - startedAt;
  const successCount = results.filter(r => r.status === 'success').length;
  const successRate = (successCount / N_RUNS * 100).toFixed(1);
  const totalTokens = results.reduce((s, r) => s + r.totalTokens, 0);
  const avgDur = Math.round(results.reduce((s, r) => s + r.durationMs, 0) / N_RUNS);
  const avgTokens = Math.round(totalTokens / N_RUNS);

  console.log('\n' + RULE);
  console.log('Summary');
  console.log(` total runs: ${N_RUNS}`);
  console.log(` successes: ${successCount}`);
  console.log(` success rate: ${successRate}%`);
  console.log(` total elapsed: ${(elapsed / 1000).toFixed(1)}s`);
  console.log(` avg duration/run: ${avgDur}ms`);
  console.log(` total tokens: ${totalTokens}`);
  console.log(` avg tokens/run: ${avgTokens}`);
  console.log(RULE);
  console.log('Status breakdown:');
  const byStatus: Record<string, number> = {};
  for (const r of results) byStatus[r.status] = (byStatus[r.status] || 0) + 1;
  for (const [k, v] of Object.entries(byStatus)) console.log(` ${k}: ${v}`);
  console.log(RULE);

  // Persist the JSON results plus raw VLM output for later use as Task 3 fixtures.
  const reportDir = path.join(REPO_ROOT, 'docs', 'superpowers', 'followups');
  fs.mkdirSync(reportDir, { recursive: true });
  const reportPath = path.join(reportDir, `2026-05-14-visual-agent-poc-raw.json`);
  fs.writeFileSync(reportPath, JSON.stringify({
    model: MODEL_NAME,
    baseUrl: ARK_BASE_URL,
    runCount: N_RUNS,
    elapsed,
    successCount,
    successRate: Number(successRate),
    avgDurationMs: avgDur,
    totalTokens,
    byStatus,
    results,
  }, null, 2));
  console.log(`Raw report written to ${reportPath}`);

  console.log('\nNow check WeChat 文件传输助手: you should see ' + N_RUNS + ' messages like [probe-i-' + ts + '].');
  console.log('If success rate >= 80%, proceed to Phase A.');
  console.log('If < 80%, stop and discuss fallback (UI-TARS / Sonnet vision / different prompt).');
}

main().catch(e => {
  console.error('FATAL:', e?.stack || e);
  process.exit(1);
});