432 lines
18 KiB
TypeScript
432 lines
18 KiB
TypeScript
/**
|
|
* Phase 0 PoC: Visual Agent 端到端验真
|
|
*
|
|
* 目标:验证 "截屏 -> 火山 VLM -> 解析 -> nut.js 点击 + 粘贴 -> 发送" 这条链路
|
|
* 在微信 4.x 文件传输助手里能不能跑通,跑 N 次统计成功率。
|
|
*
|
|
* 不依赖 neta backend。所有依赖直接通过 npm 安装到 backend node_modules。
|
|
* 数据库凭据从 netaclaw_model_channel id=2 直接读(也支持 .env 覆盖)。
|
|
*
|
|
* 用法:
|
|
* 1) 打开 PC 微信 + 登录 + 打开"文件传输助手"对话(主窗口可见,不要最小化)
|
|
* 2) cd 到仓库根目录,跑:
|
|
* pnpm --filter @neta/backend exec tsx ../../tools/visual_agent_probe/run-once.ts [N]
|
|
* N = 跑几次(默认 3,完整门禁验证应跑 20)
|
|
* 3) 等待运行,期间不要碰键盘鼠标
|
|
* 4) 完成后看微信"文件传输助手"是否收到 N 条 [probe-i-ts] 消息
|
|
* 5) 控制台会打印逐次结果 + 总成功率
|
|
*
|
|
* 注意:全程不要切窗口/碰鼠标键盘。脚本每条间隔 3 秒。
|
|
*/
|
|
|
|
/* eslint-disable @typescript-eslint/no-explicit-any */
|
|
|
|
import { spawnSync } from 'node:child_process';
import * as fs from 'node:fs';
import * as path from 'node:path';
|
|
|
|
// Repository root (this script lives in tools/visual_agent_probe/).
const REPO_ROOT = path.resolve(__dirname, '..', '..');

// Let bare `require()` calls fall back to the backend package's node_modules.
//
// Module._initPaths() rebuilds Node's global search list from NODE_PATH, and
// Module.globalPaths is only a copy of that list kept for introspection. Pushing
// onto globalPaths after _initPaths() has run therefore does not influence
// resolution, so the directory is placed on NODE_PATH *before* _initPaths() is called.
// NOTE(review): _initPaths is an undocumented Node internal - confirm on Node upgrades.
const backendNodeModules = path.join(REPO_ROOT, 'packages', 'backend', 'node_modules');
if (fs.existsSync(backendNodeModules)) {
  const existingEntries = (process.env.NODE_PATH ?? '').split(path.delimiter).filter(Boolean);
  if (!existingEntries.includes(backendNodeModules)) {
    process.env.NODE_PATH = [backendNodeModules, ...existingEntries].join(path.delimiter);
  }
}
require('module').Module._initPaths();
|
|
|
|
// ===== Configuration =====
const ARGS = process.argv.slice(2);

/** Run count used when the CLI argument is missing or unusable. */
const DEFAULT_RUNS = 3;

/**
 * Parses the run-count command-line argument.
 *
 * @param raw - First positional CLI argument, if any.
 * @returns `raw` as a positive integer, or `DEFAULT_RUNS` when it is missing, empty,
 *          non-numeric, fractional, zero or negative (a warning is printed for
 *          values that were supplied but rejected).
 */
function parseRunCount(raw: string | undefined): number {
  if (raw === undefined || raw.trim() === '') return DEFAULT_RUNS;
  const parsed = Number(raw);
  if (Number.isInteger(parsed) && parsed > 0) return parsed;
  console.warn(` [config] ignoring invalid run count "${raw}", using ${DEFAULT_RUNS}`);
  return DEFAULT_RUNS;
}

const N_RUNS = parseRunCount(ARGS[0]);

// Volcano Engine (Ark) settings, taken from the environment.
// The API key is a secret and is intentionally NOT given a hard-coded fallback:
// a literal key in source control is readable by anyone with repository access.
// NOTE(review): the key that used to be embedded here should be treated as leaked
// and rotated.
const ARK_API_KEY = process.env.ARK_API_KEY ?? '';
if (!ARK_API_KEY) {
  console.warn(' [config] ARK_API_KEY is not set; model calls will be rejected until it is provided.');
}
const ARK_BASE_URL = process.env.ARK_BASE_URL || 'https://ark.cn-beijing.volces.com/api/v3';
const MODEL_NAME = process.env.MODEL_NAME || 'doubao-seed-2-0-pro-260215';

// Name of the chat the probe verifies against and sends into.
const TARGET_ROOM = '文件传输助手';
|
|
|
|
// ===== Win32 API bindings (via koffi) =====
// Filled in by loadKoffi(); typed loosely because koffi is resolved at runtime
// from the backend package rather than imported statically.
let koffi: any;
let user32: any;
let kernel32: any;

/**
 * Requires koffi from the backend package's node_modules and opens user32.dll and
 * kernel32.dll through it, storing the results in the module-level variables above.
 */
function loadKoffi() {
  koffi = require(path.join(backendNodeModules, 'koffi'));
  [user32, kernel32] = ['user32.dll', 'kernel32.dll'].map((dllName) => koffi.load(dllName));
}
|
|
|
|
/**
 * Requests per-monitor (v2) DPI awareness for this process through
 * user32!SetProcessDpiAwarenessContext.
 *
 * Best-effort: nothing is thrown. A failure (either an exception while binding or
 * calling the function, or a FALSE return value) is only logged as a warning.
 * Does nothing on platforms other than Windows.
 */
function ensureDpiAware() {
  if (process.platform !== 'win32') return;
  try {
    const SetProcessDpiAwarenessContext = user32.func('SetProcessDpiAwarenessContext', 'bool', ['intptr']);
    // DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2 = -4
    const succeeded = SetProcessDpiAwarenessContext(-4);
    // The Win32 call reports failure through its BOOL return value instead of
    // throwing, so the result has to be inspected explicitly.
    if (!succeeded) {
      console.warn(' [dpi] SetProcessDpiAwarenessContext returned FALSE; DPI awareness was not changed');
    }
  } catch (e: any) {
    console.warn(' [dpi] SetProcessDpiAwarenessContext failed:', e?.message || e);
  }
}
|
|
|
|
/** Position and size of a window as reported by node-screenshots. */
interface WinBounds {
  x: number;
  y: number;
  width: number;
  height: number;
}

/** Snapshot of the WeChat window selected by findWeixinWindow(). */
interface WinHandle {
  /** Window id reported by node-screenshots; passed to the user32 calls as the HWND. */
  hwnd: number;
  /** Id of the process that owns the window. */
  pid: number;
  /** Window title at the time of enumeration. */
  title: string;
  /** Window rectangle at the time of enumeration. */
  bounds: WinBounds;
  /** The underlying node-screenshots Window instance, used for capturing. */
  nsWindow: any;
}
|
|
|
|
/**
 * Locates the WeChat main window.
 *
 * Enumerates all windows through node-screenshots, keeps those whose app name is
 * 'Weixin' or 'WeChat' and that are not minimized, and picks the one with the
 * largest area (taken to be the main window). Windows whose accessors throw are
 * skipped.
 *
 * @returns A handle describing the chosen window, or `null` when not running on
 *          Windows or when no matching window exists.
 */
function findWeixinWindow(): WinHandle | null {
  if (process.platform !== 'win32') return null;

  const { Window } = require(path.join(backendNodeModules, 'node-screenshots'));

  let largest: any = null;
  let largestArea = 0;
  for (const candidate of Window.all()) {
    try {
      const appName = candidate.appName();
      const isWeChat = appName === 'Weixin' || appName === 'WeChat';
      if (!isWeChat || candidate.isMinimized()) continue;

      const candidateArea = candidate.width() * candidate.height();
      if (candidateArea > largestArea) {
        largestArea = candidateArea;
        largest = candidate;
      }
    } catch {
      // A window that cannot be queried is simply not a candidate.
    }
  }

  if (largest === null) return null;

  const hwnd = largest.id();
  const pid = largest.pid();
  const title = largest.title();
  const bounds = {
    x: largest.x(),
    y: largest.y(),
    width: largest.width(),
    height: largest.height(),
  };
  return { hwnd, pid, title, bounds, nsWindow: largest };
}
|
|
|
|
/**
 * Restores (if minimized) and raises the given window, then asks Windows to make
 * it the foreground window.
 *
 * Windows may refuse the foreground request. That refusal is reported through the
 * return value of SetForegroundWindow, so it is checked and logged: the caller
 * sends keystrokes right after this call and they would otherwise land in
 * whatever window still has focus without any hint in the output.
 *
 * @param hwnd - Handle of the window to activate.
 */
function activateWindow(hwnd: number) {
  const SetForegroundWindow = user32.func('SetForegroundWindow', 'bool', ['intptr']);
  const ShowWindow = user32.func('ShowWindow', 'bool', ['intptr', 'int']);
  const BringWindowToTop = user32.func('BringWindowToTop', 'bool', ['intptr']);
  const IsIconic = user32.func('IsIconic', 'bool', ['intptr']);

  if (IsIconic(hwnd)) {
    ShowWindow(hwnd, 9); // SW_RESTORE
  }
  BringWindowToTop(hwnd);
  if (!SetForegroundWindow(hwnd)) {
    console.warn(` [activate] SetForegroundWindow(${hwnd}) was refused; keystrokes may reach another window`);
  }
}
|
|
|
|
// ===== Screenshot =====
/**
 * Captures the WeChat window and returns it as PNG bytes.
 *
 * The window is enumerated again on every call: according to the original author's
 * note, reusing a node-screenshots Window object yields a cached, stale frame.
 *
 * @param _winIgnored - Unused; the window is always looked up afresh.
 * @returns The PNG-encoded screenshot.
 * @throws Error('weixin window vanished') when the window can no longer be found.
 */
async function captureWindowPng(_winIgnored: WinHandle): Promise<Buffer> {
  const current = findWeixinWindow();
  if (!current) {
    throw new Error('weixin window vanished');
  }
  const pngBytes = current.nsWindow.captureImageSync().toPngSync();
  return Buffer.from(pngBytes);
}
|
|
|
|
// ===== Clipboard (through Windows clip.exe, avoiding the clipboardy v5 ESM problem) =====
/**
 * Places `text` on the Windows clipboard by piping it into clip.exe.
 *
 * @param text - Text to copy.
 * @throws Error prefixed with 'clip.exe failed: ' when clip.exe cannot be started
 *         or does not exit with status 0. The message carries the spawn error,
 *         the stderr output, the terminating signal or the exit code, whichever
 *         is available, so the cause is never lost.
 */
function clipboardWrite(text: string): void {
  // clip.exe reads from stdin and needs UTF-16 LE with a BOM to handle Chinese text.
  const payload = Buffer.concat([
    Buffer.from([0xff, 0xfe]),
    Buffer.from(text, 'utf16le'),
  ]);
  const result = spawnSync('clip.exe', [], { input: payload });

  // When the process could not be spawned at all (e.g. ENOENT), `status` is null and
  // `stderr` is null; the reason is only available on `result.error`.
  if (result.error) {
    throw new Error('clip.exe failed: ' + result.error.message);
  }
  if (result.status !== 0) {
    const stderrText = result.stderr?.toString().trim();
    const detail =
      stderrText ||
      (result.signal ? `terminated by ${result.signal}` : `exit code ${result.status}`);
    throw new Error('clip.exe failed: ' + detail);
  }
}
|
|
|
|
// ===== Keyboard & mouse (nut.js) =====
/**
 * Loads nut.js explicitly from the backend package's node_modules (it is CommonJS,
 * so a plain require works) and shortens its automatic delay between keyboard and
 * mouse actions to 30 ms, which is shorter than the default but still leaves a
 * small pause.
 *
 * @returns The nut.js module plus the clipboard writer under the `clipboardyWrite` key.
 */
async function loadInput() {
  const nut = require(path.join(backendNodeModules, '@nut-tree-fork', 'nut-js'));
  for (const device of [nut.keyboard, nut.mouse]) {
    device.config.autoDelayMs = 30;
  }
  return { nut, clipboardyWrite: clipboardWrite };
}
|
|
|
|
/** Returns a promise that resolves once a `ms`-millisecond timer has fired. */
const sleep = (ms: number) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });

/** Floors a random sample drawn between `min` (inclusive) and `max`. */
const jitter = (min: number, max: number) => {
  const span = max - min;
  return Math.floor(min + Math.random() * span);
};
|
|
|
|
// ===== VLM call =====
/** A single action as decoded from the model's JSON reply. */
interface VlmAction {
  type: string;
  x?: number;
  y?: number;
  key?: string;
  text?: string;
  reason?: string;
}

/** Outcome of one model call made by callVlm(). */
interface VlmResponse {
  /** Trimmed text content of the first choice ('' when absent). */
  raw: string;
  /** Action parsed from `raw`, or a `failed` action when parsing did not succeed. */
  action: VlmAction;
  /** Token usage reported by the API (0 when not reported). */
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
  /** Wall-clock time measured around the request, in milliseconds. */
  latencyMs: number;
}
|
|
|
|
/**
 * Turns the model's raw reply into an action object.
 *
 * The reply is first stripped of a surrounding markdown code fence. If the result
 * is not valid JSON as a whole, the span from the first `{` to the last `}` is
 * tried as well, which tolerates a sentence before or after the JSON. A parsed
 * value is accepted only when it is a non-array object with a string `type`;
 * values such as `null`, numbers or arrays would otherwise crash callers that
 * read `action.type`.
 *
 * @param raw - Text returned by the model.
 * @returns The parsed action, or `{ type: 'failed', reason }` where `reason` starts
 *          with 'json-parse-failed: ' (nothing parsed) or 'invalid-action-shape: '
 *          (JSON parsed but was not an action object), followed by up to 200
 *          characters of `raw`.
 */
function parseVlmAction(raw: string): VlmResponse['action'] {
  // Remove a possible markdown code fence.
  const cleaned = raw.replace(/^```(?:json)?\s*|\s*```$/g, '').trim();

  const candidates = [cleaned];
  const firstBrace = cleaned.indexOf('{');
  const lastBrace = cleaned.lastIndexOf('}');
  const hasSurroundingText = firstBrace > 0 || lastBrace < cleaned.length - 1;
  if (firstBrace !== -1 && lastBrace > firstBrace && hasSurroundingText) {
    candidates.push(cleaned.slice(firstBrace, lastBrace + 1));
  }

  let parsedAnything = false;
  for (const candidate of candidates) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(candidate);
    } catch {
      continue;
    }
    parsedAnything = true;
    if (
      typeof parsed === 'object' &&
      parsed !== null &&
      !Array.isArray(parsed) &&
      typeof (parsed as { type?: unknown }).type === 'string'
    ) {
      return parsed as VlmResponse['action'];
    }
  }

  const prefix = parsedAnything ? 'invalid-action-shape: ' : 'json-parse-failed: ';
  return { type: 'failed', reason: prefix + raw.slice(0, 200) };
}

/**
 * Sends a screenshot plus a task description to the vision model and parses the
 * single JSON action it is instructed to return.
 *
 * @param screenshot - PNG bytes of the WeChat window.
 * @param task - Task text interpolated into the system prompt.
 * @returns The raw reply, the parsed action (a `failed` action when the reply is
 *          not a usable action object), token usage and request latency.
 * @throws Whatever the OpenAI client throws for transport or API errors.
 */
async function callVlm(screenshot: Buffer, task: string): Promise<VlmResponse> {
  const oaiPath = path.join(backendNodeModules, 'openai');
  const OpenAIMod = require(oaiPath);
  const OpenAI = OpenAIMod.default || OpenAIMod;
  const client = new OpenAI({ apiKey: ARK_API_KEY, baseURL: ARK_BASE_URL });

  const b64 = screenshot.toString('base64');
  const t0 = Date.now();

  const systemPrompt = `你是一个 Windows 桌面操作助手。
看到的是 PC 微信 4.x 主窗口截图。
你的任务: ${task}

输出严格 JSON, 不要任何解释或代码块标记:
{"type":"click","x":<int>,"y":<int>,"reason":"<简短理由>"}
或 {"type":"hotkey","key":"ctrl+v"|"enter"|"ctrl+f"|"ctrl+1","reason":"..."}
或 {"type":"finished","reason":"..."}
或 {"type":"failed","reason":"为何无法完成"}

只输出一个动作 JSON。click 坐标必须在截图范围内, 以截图左上角为原点。`;

  const resp = await client.chat.completions.create({
    model: MODEL_NAME,
    messages: [
      { role: 'system', content: systemPrompt },
      {
        role: 'user',
        content: [
          { type: 'text', text: '请给出下一步动作的 JSON。' },
          { type: 'image_url', image_url: { url: 'data:image/png;base64,' + b64 } },
        ],
      },
    ],
    temperature: 0.2,
    max_tokens: 400,
  });

  const latencyMs = Date.now() - t0;
  const raw = resp.choices?.[0]?.message?.content?.trim() || '';

  return {
    raw,
    action: parseVlmAction(raw),
    promptTokens: resp.usage?.prompt_tokens || 0,
    completionTokens: resp.usage?.completion_tokens || 0,
    totalTokens: resp.usage?.total_tokens || 0,
    latencyMs,
  };
}
|
|
|
|
// ===== Single run =====
/** Terminal state of one probe run. */
type RunStatus =
  | 'success'
  | 'window-missing'
  | 'verify-failed'
  | 'model-failed'
  | 'action-failed'
  | 'exception';

/** One recorded step of a run: the action taken and, for model steps, the raw reply. */
interface RunTraceEntry {
  step: number;
  action: any;
  raw?: string;
}

/** Result record produced for every run and written to the JSON report. */
interface RunResult {
  /** 1-based run number. */
  index: number;
  /** The probe text that was (or would have been) sent. */
  text: string;
  status: RunStatus;
  /** Number of trace entries recorded. */
  steps: number;
  modelCalls: number;
  totalTokens: number;
  durationMs: number;
  /** Human-readable failure detail; absent on success. */
  error?: string;
  trace: RunTraceEntry[];
}
|
|
|
|
/** Captures the WeChat window and also stores the PNG in the probe's debug directory. */
async function captureAndSaveDebugShot(win: WinHandle, fileName: string): Promise<Buffer> {
  const shot = await captureWindowPng(win);
  const debugDir = path.join(REPO_ROOT, 'tools', 'visual_agent_probe', 'debug');
  fs.mkdirSync(debugDir, { recursive: true });
  fs.writeFileSync(path.join(debugDir, fileName), shot);
  return shot;
}

/**
 * Executes one probe run: verify that the target chat is open, paste and send a
 * unique probe text, then verify through the model that the message is visible.
 *
 * Phase 0 simplification: the target chat is assumed to have been opened manually.
 * No Ctrl+F navigation is attempted, because in WeChat 4.x the first global search
 * hit is often an official account rather than the target chat, so Enter would
 * open the wrong thing. Navigation is left for Phase A (arrow keys + Enter, or
 * letting the model click the list entry). Only the "paste + send + verify" core
 * path is exercised here.
 *
 * @param index - 1-based run number, embedded in the probe text.
 * @param input - Object returned by loadInput() (`nut`, `clipboardyWrite`).
 * @param ts - Timestamp shared by all runs of this invocation, embedded in the
 *             probe text and in debug file names.
 * @returns A RunResult; failures of the capture, model and keyboard/clipboard
 *          steps are reported through `status` / `error`, not thrown.
 */
async function runOnce(index: number, input: any, ts: number): Promise<RunResult> {
  const text = `[probe-${index}-${ts}]`;
  const trace: any[] = [];
  let modelCalls = 0;
  let totalTokens = 0;
  const t0 = Date.now();

  // Builds the result from the counters accumulated up to the point of return.
  const finish = (status: RunResult['status'], error?: string): RunResult => ({
    index,
    text,
    status,
    steps: trace.length,
    modelCalls,
    totalTokens,
    durationMs: Date.now() - t0,
    trace,
    ...(error === undefined ? {} : { error }),
  });

  const win = findWeixinWindow();
  if (!win) {
    return finish('window-missing', 'weixin not running');
  }
  console.log(` win: hwnd=${win.hwnd} title=${win.title} bounds=${JSON.stringify(win.bounds)}`);
  activateWindow(win.hwnd);
  await sleep(600);

  // ====================== Step 4: model confirms the current chat is the target ======================
  let shot1: Buffer;
  try {
    shot1 = await captureAndSaveDebugShot(win, `run-${index}-${ts}-step4-verify-in-chat.png`);
  } catch (e: any) {
    return finish('exception', 'capture failed: ' + (e?.message || e));
  }

  let vlm: VlmResponse;
  try {
    vlm = await callVlm(shot1, `仔细看截图。微信主窗口右侧是当前打开的聊天对话,顶部应该有该聊天的名字。判断当前打开的聊天是否就是"${TARGET_ROOM}"(看右侧上方的标题文字)。如果是, 输出 {"type":"finished","reason":"current chat is ${TARGET_ROOM}"}。如果不是 (打开的是其他聊天 / 公众号面板 / 通讯录), 输出 {"type":"failed","reason":"当前聊天是XX, 不是${TARGET_ROOM}"}。`);
    modelCalls++;
    totalTokens += vlm.totalTokens;
    trace.push({ step: 4, action: vlm.action, raw: vlm.raw });
    console.log(` vlm-1 (verify-in-chat) action=${JSON.stringify(vlm.action)} tokens=${vlm.totalTokens} latency=${vlm.latencyMs}ms`);
  } catch (e: any) {
    return finish('model-failed', e?.message || String(e));
  }

  // `action` comes from parsed model output; guard against it not being an object.
  if (vlm.action?.type !== 'finished') {
    return finish('verify-failed', 'not in target chat: ' + vlm.action?.reason);
  }

  // ====================== Step 5/6: paste + Enter ======================
  // Input failures are reported as 'action-failed' so the trace and token counts
  // gathered so far are kept instead of being lost to the caller's generic handler.
  const { keyboard, Key } = input.nut;
  try {
    await input.clipboardyWrite(text);
    await sleep(150);

    await keyboard.pressKey(Key.LeftControl);
    try {
      await keyboard.pressKey(Key.V);
      await keyboard.releaseKey(Key.V);
    } finally {
      // Always let go of Ctrl; a modifier left held down would corrupt every
      // keystroke that follows, including the user's own.
      await keyboard.releaseKey(Key.LeftControl);
    }
    await sleep(jitter(300, 600));
    trace.push({ step: 5, action: { type: 'type', text } });

    await keyboard.pressKey(Key.Enter);
    await keyboard.releaseKey(Key.Enter);
    await sleep(jitter(800, 1300));
    trace.push({ step: 6, action: { type: 'hotkey', key: 'enter' } });
  } catch (e: any) {
    return finish('action-failed', 'input failed: ' + (e?.message || e));
  }

  // ====================== Step 7: model confirms the message was sent ======================
  let shot2: Buffer;
  try {
    shot2 = await captureAndSaveDebugShot(win, `run-${index}-${ts}-step7-verify-sent.png`);
  } catch (e: any) {
    return finish('exception', 'verify capture failed: ' + (e?.message || e));
  }

  let vlm2: VlmResponse;
  try {
    vlm2 = await callVlm(shot2, `仔细看截图。微信聊天窗口右侧底部是最新消息。判断最新一条消息(气泡显示在屏幕右侧的, 即己方发出的)的文本是否包含 "${text}" (注意是字面字符串, 不要看意思)。如果包含, 输出 {"type":"finished","reason":"message visible at bottom"}。否则输出 {"type":"failed","reason":"最底部最新消息内容: <实际看到的文本>"}。`);
    modelCalls++;
    totalTokens += vlm2.totalTokens;
    trace.push({ step: 7, action: vlm2.action, raw: vlm2.raw });
    console.log(` vlm-2 (verify-sent) action=${JSON.stringify(vlm2.action)} tokens=${vlm2.totalTokens} latency=${vlm2.latencyMs}ms`);
  } catch (e: any) {
    return finish('model-failed', 'verify-sent model err: ' + (e?.message || e));
  }

  if (vlm2.action?.type === 'finished') {
    return finish('success');
  }
  return finish('verify-failed', 'message not visible in chat: ' + vlm2.action?.reason);
}
|
|
|
|
// ===== main =====
/**
 * Aggregates per-run results into the figures shown in the summary and report.
 *
 * All averages and the success rate are computed over the runs that actually
 * happened, and fall back to 0 / '0.0' when there were none, so an empty or
 * invalid run count can never produce NaN in the output.
 *
 * @param results - Results of the executed runs.
 * @returns Counts, totals, rounded per-run averages, the success rate as a
 *          one-decimal percentage string, and a per-status tally.
 */
function summarizeRuns(results: RunResult[]) {
  const runCount = results.length;
  const successCount = results.filter(r => r.status === 'success').length;
  const totalTokens = results.reduce((sum, r) => sum + r.totalTokens, 0);
  const totalDurationMs = results.reduce((sum, r) => sum + r.durationMs, 0);
  const perRun = (total: number) => (runCount > 0 ? Math.round(total / runCount) : 0);

  const byStatus: Record<string, number> = {};
  for (const r of results) byStatus[r.status] = (byStatus[r.status] || 0) + 1;

  return {
    runCount,
    successCount,
    successRate: runCount > 0 ? (successCount / runCount * 100).toFixed(1) : '0.0',
    totalTokens,
    avgDur: perRun(totalDurationMs),
    avgTokens: perRun(totalTokens),
    byStatus,
  };
}

/**
 * Script entry point: prints the configuration, prepares the Win32 and input
 * bindings, runs the probe `N_RUNS` times with a 3 s cooldown between runs,
 * prints a summary and writes the raw results as JSON.
 *
 * Exits the process with status 1 when not on Windows, when no API key is
 * configured, or when the WeChat window cannot be found.
 */
async function main() {
  console.log('========================================');
  console.log('Visual Agent Phase 0 PoC');
  console.log(` model: ${MODEL_NAME}`);
  console.log(` baseUrl: ${ARK_BASE_URL}`);
  // Only a short prefix is shown; console output tends to end up in logs and chats.
  console.log(` apiKey: ${ARK_API_KEY.slice(0, 4)}...`);
  console.log(` N runs: ${N_RUNS}`);
  console.log(` target: ${TARGET_ROOM}`);
  console.log('========================================');

  if (process.platform !== 'win32') {
    console.error('This probe only runs on Windows.');
    process.exit(1);
  }

  // Fail before touching the desktop: without a key every run would only end in
  // a model error after already sending keystrokes' worth of setup.
  if (!ARK_API_KEY) {
    console.error('ARK_API_KEY is empty. Set the ARK_API_KEY environment variable and try again.');
    process.exit(1);
  }

  loadKoffi();
  ensureDpiAware();
  console.log('DPI Aware set.');

  const input = await loadInput();
  console.log('nut.js + clipboardy loaded.');

  const win = findWeixinWindow();
  if (!win) {
    console.error('WeChat (Weixin / 微信) window not found. Open it and try again.');
    process.exit(1);
  }
  console.log(`WeChat window found. hwnd=${win.hwnd} title=${win.title} bounds=${JSON.stringify(win.bounds)}`);

  console.log('\nStarting in 8 seconds. NOW switch to WeChat and open 文件传输助手. DO NOT touch keyboard/mouse after.\n');
  await sleep(8000);

  const startedAt = Date.now();
  const ts = Date.now();
  const results: RunResult[] = [];

  for (let i = 1; i <= N_RUNS; i++) {
    console.log(`\n--- Run ${i}/${N_RUNS} ---`);
    const runStartedAt = Date.now();
    let r: RunResult;
    try {
      r = await runOnce(i, input, ts);
    } catch (e: any) {
      // Record how long the run took before it blew up instead of reporting 0 ms.
      r = { index: i, text: `[probe-${i}-${ts}]`, status: 'exception', steps: 0, modelCalls: 0, totalTokens: 0, durationMs: Date.now() - runStartedAt, trace: [], error: e?.message || String(e) };
    }
    results.push(r);
    console.log(` result: status=${r.status} steps=${r.steps} modelCalls=${r.modelCalls} tokens=${r.totalTokens} dur=${r.durationMs}ms${r.error ? ' err=' + r.error : ''}`);
    if (i < N_RUNS) {
      console.log(' cooldown 3s ...');
      await sleep(3000);
    }
  }

  const elapsed = Date.now() - startedAt;
  const { runCount, successCount, successRate, totalTokens, avgDur, avgTokens, byStatus } = summarizeRuns(results);

  console.log('\n========================================');
  console.log('Summary');
  console.log(` total runs: ${runCount}`);
  console.log(` successes: ${successCount}`);
  console.log(` success rate: ${successRate}%`);
  console.log(` total elapsed: ${(elapsed / 1000).toFixed(1)}s`);
  console.log(` avg duration/run: ${avgDur}ms`);
  console.log(` total tokens: ${totalTokens}`);
  console.log(` avg tokens/run: ${avgTokens}`);
  console.log('========================================');
  console.log('Status breakdown:');
  for (const [k, v] of Object.entries(byStatus)) console.log(` ${k}: ${v}`);
  console.log('========================================');

  // Persist the JSON results together with the raw model output for the later
  // Task 3 fixtures. The file name is fixed, so each invocation overwrites the
  // previous report.
  const reportDir = path.join(REPO_ROOT, 'docs', 'superpowers', 'followups');
  fs.mkdirSync(reportDir, { recursive: true });
  const reportPath = path.join(reportDir, `2026-05-14-visual-agent-poc-raw.json`);
  fs.writeFileSync(reportPath, JSON.stringify({
    model: MODEL_NAME,
    baseUrl: ARK_BASE_URL,
    runCount,
    elapsed,
    successCount,
    successRate: Number(successRate),
    avgDurationMs: avgDur,
    totalTokens,
    byStatus,
    results,
  }, null, 2));
  console.log(`Raw report written to ${reportPath}`);

  console.log('\nNow check WeChat 文件传输助手: you should see ' + N_RUNS + ' messages like [probe-i-' + ts + '].');
  console.log('If success rate >= 80%, proceed to Phase A.');
  console.log('If < 80%, stop and discuss fallback (UI-TARS / Sonnet vision / different prompt).');
}
|
|
|
|
// Start the probe. A rejection that escapes main() is printed (with its stack
// when there is one) and the process exits with status 1.
main().catch((err) => {
  const detail = err?.stack || err;
  console.error('FATAL:', detail);
  process.exit(1);
});
|