432 lines
18 KiB
TypeScript
Raw Normal View History

2026-05-20 21:39:12 +08:00
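/**
 * Phase 0 PoC: Visual Agent
 *
 * Goal: verify the chain "screenshot -> Volcano Engine VLM -> parse -> nut.js click + paste -> send"
 * against the WeChat 4.x PC client, repeated N times.
 *
 * Third-party dependencies are not installed separately for this script; they are loaded from
 * the neta backend package's node_modules.
 * Model settings correspond to netaclaw_model_channel id=2 (can be overridden through .env / environment variables).
 *
 * Usage:
 *   1) Start PC WeChat, log in, and open the "文件传输助手" (File Transfer Assistant) chat.
 *   2) cd into the repository, then run:
 *        pnpm --filter @neta/backend exec tsx ../../tools/visual_agent_probe/run-once.ts [N]
 *      N = number of runs (default 3; NOTE(review): the original text also mentioned 20, presumably the
 *      recommended count for a meaningful success rate; confirm)
 *   3) After launching, switch to WeChat and leave keyboard and mouse alone.
 *   4) "文件传输助手" should receive N messages of the form [probe-i-ts].
 *   5) The script prints a summary and writes a raw JSON report.
 *
 * Note: do not switch windows or touch the keyboard/mouse during the whole run; there is a 3 s cooldown between runs.
 */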
/* eslint-disable @typescript-eslint/no-explicit-any */
import { spawnSync } from 'node:child_process';
import * as fs from 'node:fs';
import * as path from 'node:path';
// Repository root (this script lives under tools/visual_agent_probe/, i.e. two levels below the root).
const REPO_ROOT = path.resolve(__dirname, '..', '..');
// Let require() resolve dependencies from the backend package's node_modules.
// Order matters: _initPaths() is called first, and only afterwards is the backend
// directory prepended to Module.globalPaths.
// NOTE(review): Module._initPaths and Module.globalPaths look like Node internals
// (underscore-prefixed / accessed through an `any` cast); confirm they behave the same on the Node version in use.
require('module').Module._initPaths();
const backendNodeModules = path.join(REPO_ROOT, 'packages', 'backend', 'node_modules');
// Only register the directory when it actually exists on disk.
if (fs.existsSync(backendNodeModules)) {
(require('module').Module as any).globalPaths.unshift(backendNodeModules);
}
// ===== Configuration =====
const ARGS = process.argv.slice(2);

/** Number of runs used when no (or an invalid) count is given on the command line. */
const DEFAULT_N_RUNS = 3;

/**
 * Parses the run count from a raw CLI argument.
 *
 * @param raw First positional CLI argument, or `undefined` when absent.
 * @returns A positive integer. Missing or empty input yields the default; anything that is
 *          not a positive integer (NaN, 0, negatives, fractions) is rejected with a warning
 *          and also yields the default, so later loops and averages never see 0 or NaN.
 */
function parseRunCount(raw: string | undefined): number {
  if (raw === undefined || raw.trim() === '') return DEFAULT_N_RUNS;
  const parsed = Number(raw);
  if (!Number.isInteger(parsed) || parsed < 1) {
    console.warn(`[config] ignoring invalid run count ${JSON.stringify(raw)}; using ${DEFAULT_N_RUNS}`);
    return DEFAULT_N_RUNS;
  }
  return parsed;
}

const N_RUNS = parseRunCount(ARGS[0]);

// Volcano Engine (Ark) credentials come from the environment (.env) only.
// SECURITY: a literal API key used to be committed here as a fallback. Secrets must not live
// in source control; the previously committed key should be treated as leaked and rotated.
const ARK_API_KEY = process.env.ARK_API_KEY || '';
if (!ARK_API_KEY) {
  console.warn('[config] ARK_API_KEY is not set; VLM calls will fail until it is provided via the environment.');
}
const ARK_BASE_URL = process.env.ARK_BASE_URL || 'https://ark.cn-beijing.volces.com/api/v3';
const MODEL_NAME = process.env.MODEL_NAME || 'doubao-seed-2-0-pro-260215';
// Name of the chat that must already be open in WeChat; runtime string, keep verbatim.
const TARGET_ROOM = '文件传输助手';
// ===== Win32 API via koffi =====
// Module-level handles; they stay undefined until loadKoffi() has run.
let koffi: any;
let user32: any;
let kernel32: any;

/**
 * Loads the koffi FFI package from the backend node_modules directory and opens
 * user32.dll and kernel32.dll through it, storing the results in the module-level handles.
 * (kernel32 is loaded here but not referenced elsewhere in the visible code.)
 */
function loadKoffi() {
  koffi = require(path.join(backendNodeModules, 'koffi'));
  const openLibrary = (dllName: string) => koffi.load(dllName);
  user32 = openLibrary('user32.dll');
  kernel32 = openLibrary('kernel32.dll');
}
/**
 * Best-effort request for per-monitor-v2 DPI awareness for this process.
 *
 * No-op on non-Windows platforms. Never throws: both an FFI failure and a FALSE return
 * from the Win32 call are reported as warnings so the probe can continue.
 * NOTE(review): presumably needed so window bounds and screenshots use physical pixels; confirm.
 */
function ensureDpiAware() {
  if (process.platform !== 'win32') return;
  try {
    const SetProcessDpiAwarenessContext = user32.func('SetProcessDpiAwarenessContext', 'bool', ['intptr']);
    // DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2 = -4
    const applied = SetProcessDpiAwarenessContext(-4);
    // The API signals failure through its BOOL result rather than by throwing,
    // so the return value has to be checked explicitly.
    if (!applied) {
      console.warn(' [dpi] SetProcessDpiAwarenessContext returned FALSE (DPI awareness not changed)');
    }
  } catch (e: any) {
    console.warn(' [dpi] SetProcessDpiAwarenessContext failed:', e?.message || e);
  }
}
/** Snapshot of the WeChat window selected by findWeixinWindow(). */
interface WinHandle {
  /** Window id as reported by node-screenshots; passed to user32 calls as the window handle. */
  hwnd: number;
  /** Process id reported for the window. */
  pid: number;
  /** Window title at enumeration time. */
  title: string;
  /** Window position and size at enumeration time. */
  bounds: Record<'x' | 'y' | 'width' | 'height', number>;
  /** Reference to the node-screenshots Window instance, used for taking screenshots. */
  nsWindow: any;
}
/**
 * Enumerates windows through node-screenshots and picks the WeChat main window.
 *
 * Candidates are windows whose appName is 'Weixin' or 'WeChat' and that are not minimized;
 * among them the one with the largest area wins (taken to be the main window).
 * A window whose accessors throw is skipped.
 *
 * @returns A handle for the chosen window, or null on non-Windows platforms or when no
 *          candidate with a positive area exists.
 */
function findWeixinWindow(): WinHandle | null {
  if (process.platform !== 'win32') return null;

  const screenshots = require(path.join(backendNodeModules, 'node-screenshots'));
  const wechatAppNames = ['Weixin', 'WeChat'];

  let largest: any = null;
  let largestArea = 0;
  for (const candidate of screenshots.Window.all()) {
    try {
      const isWechat = wechatAppNames.includes(candidate.appName());
      if (!isWechat || candidate.isMinimized()) continue;
      const candidateArea = candidate.width() * candidate.height();
      if (candidateArea <= largestArea) continue;
      largestArea = candidateArea;
      largest = candidate;
    } catch { /* skip */ }
  }

  if (largest === null) return null;

  const bounds = {
    x: largest.x(),
    y: largest.y(),
    width: largest.width(),
    height: largest.height(),
  };
  return {
    hwnd: largest.id(),
    pid: largest.pid(),
    title: largest.title(),
    bounds,
    nsWindow: largest,
  };
}
/**
 * Brings the given window to the foreground via user32.
 *
 * All four user32 functions are bound before any of them is called. A minimized window
 * (IsIconic) is restored first, then the window is raised and made the foreground window.
 * Return values of the Win32 calls are not inspected.
 *
 * @param hwnd Window handle to activate.
 */
function activateWindow(hwnd: number) {
  const SW_RESTORE = 9;
  const bindBool = (name: string, argTypes: string[]) => user32.func(name, 'bool', argTypes);

  const setForeground = bindBool('SetForegroundWindow', ['intptr']);
  const showWindow = bindBool('ShowWindow', ['intptr', 'int']);
  const bringToTop = bindBool('BringWindowToTop', ['intptr']);
  const isMinimized = bindBool('IsIconic', ['intptr']);

  if (isMinimized(hwnd)) showWindow(hwnd, SW_RESTORE);
  bringToTop(hwnd);
  setForeground(hwnd);
}
// ===== Screenshot =====
// Note: the window list must be re-enumerated before every capture; otherwise the
// node-screenshots Window object returns a cached, stale frame.
/**
 * Captures the current WeChat window as PNG bytes.
 *
 * @param _winIgnored Unused; the window is looked up again on every call (see note above).
 * @returns PNG-encoded screenshot.
 * @throws Error('weixin window vanished') when the window can no longer be found.
 */
async function captureWindowPng(_winIgnored: WinHandle): Promise<Buffer> {
  const current = findWeixinWindow();
  if (!current) {
    throw new Error('weixin window vanished');
  }
  const frame = current.nsWindow.captureImageSync();
  const pngBytes = frame.toPngSync();
  return Buffer.from(pngBytes);
}
// ===== Clipboard (uses Windows clip.exe to avoid the clipboardy v5 ESM problem) =====
/**
 * Puts `text` on the Windows clipboard by piping it into clip.exe.
 *
 * @param text Text to place on the clipboard.
 * @throws Error prefixed with 'clip.exe failed: ' when the process cannot be spawned
 *         (e.g. clip.exe is not on PATH) or exits with a non-zero status. The message
 *         carries the spawn error, stderr, or the exit status/signal, whichever is available.
 */
function clipboardWrite(text: string): void {
  // clip.exe reads from stdin; per the original author it needs a UTF-16 LE BOM
  // to handle Chinese text correctly.
  const payload = Buffer.concat([
    Buffer.from([0xff, 0xfe]),
    Buffer.from(text, 'utf16le'),
  ]);
  const r = spawnSync('clip.exe', [], { input: payload });
  // A spawn failure sets r.error and leaves status null; surface it instead of
  // reporting an empty stderr.
  if (r.error) {
    throw new Error('clip.exe failed: ' + r.error.message);
  }
  if (r.status !== 0) {
    const stderrText = r.stderr?.toString().trim();
    const detail = stderrText || (r.signal ? `killed by ${r.signal}` : `exit status ${r.status}`);
    throw new Error('clip.exe failed: ' + detail);
  }
}
// ===== Keyboard & mouse (nut.js) =====
/**
 * Loads nut.js from the backend node_modules directory and tunes its input delays.
 *
 * @returns The nut.js module plus the clipboard writer, exposed as `clipboardyWrite`.
 */
async function loadInput() {
  // Load nut.js explicitly from the backend node_modules (it is CJS, so require works).
  const nut = require(path.join(backendNodeModules, '@nut-tree-fork', 'nut-js'));
  // Shorten nut's default interval between actions while still keeping a small delay.
  const autoDelayMs = 30;
  nut.keyboard.config.autoDelayMs = autoDelayMs;
  nut.mouse.config.autoDelayMs = autoDelayMs;
  return { nut, clipboardyWrite: clipboardWrite };
}
/** Resolves after roughly `ms` milliseconds. */
const sleep = (ms: number) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });

/** Random value in [min, max), floored after the offset is added to `min`. */
const jitter = (min: number, max: number) => {
  const span = max - min;
  return Math.floor(min + Math.random() * span);
};
// ===== VLM call =====
/** Result of one VLM round trip: raw text, parsed action, token usage and latency. */
interface VlmResponse {
  /** Trimmed text content of the first choice ('' when absent). */
  raw: string;
  /** Parsed action; `type` is always a string ('failed' when parsing or validation fails). */
  action: { type: string; x?: number; y?: number; key?: string; text?: string; reason?: string };
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
  /** Wall-clock time from just before the prompt is built until the response arrives, in ms. */
  latencyMs: number;
}

/**
 * Turns the model's raw text into an action object without ever throwing.
 *
 * Accepts bare JSON, JSON inside a markdown code fence, and (as a fallback) a JSON object
 * embedded in surrounding prose. Anything that is not an object with a string `type`
 * is mapped to a `failed` action, so callers can safely read `action.type`.
 *
 * @param raw Model output text.
 * @returns The parsed action, or `{ type: 'failed', reason }` with the first 200 chars of `raw`.
 */
function parseVlmAction(raw: string): VlmResponse['action'] {
  const fail = (why: string) => ({ type: 'failed', reason: `${why}: ${raw.slice(0, 200)}` });

  // Strip an optional markdown code fence.
  const cleaned = raw.replace(/^```(?:json)?\s*|\s*```$/g, '').trim();
  const candidates = [cleaned];
  // Fallback: the outermost {...} span, for replies that wrap the JSON in prose.
  const open = cleaned.indexOf('{');
  const close = cleaned.lastIndexOf('}');
  if (open !== -1 && close > open) {
    const span = cleaned.slice(open, close + 1);
    if (span !== cleaned) candidates.push(span);
  }

  let parsed: unknown;
  let parsedOk = false;
  for (const candidate of candidates) {
    try {
      parsed = JSON.parse(candidate);
      parsedOk = true;
      break;
    } catch {
      /* try the next candidate */
    }
  }
  if (!parsedOk) return fail('json-parse-failed');

  // JSON.parse happily returns null, numbers, strings and arrays; none of those is an action.
  const looksLikeAction =
    typeof parsed === 'object' &&
    parsed !== null &&
    !Array.isArray(parsed) &&
    typeof (parsed as { type?: unknown }).type === 'string';
  if (!looksLikeAction) return fail('json-shape-invalid');

  return parsed as VlmResponse['action'];
}

/**
 * Sends one screenshot plus a task description to the configured VLM through the
 * OpenAI-compatible client and parses the reply into an action.
 *
 * @param screenshot PNG bytes; sent as a base64 data URL.
 * @param task Task text interpolated into the system prompt.
 * @returns Raw reply, parsed action, token usage (0 when not reported) and latency.
 * @throws Whatever the client throws for transport/API errors; parse problems do not throw.
 */
async function callVlm(screenshot: Buffer, task: string): Promise<VlmResponse> {
  const oaiPath = path.join(backendNodeModules, 'openai');
  const OpenAIMod = require(oaiPath);
  const OpenAI = OpenAIMod.default || OpenAIMod;
  const client = new OpenAI({ apiKey: ARK_API_KEY, baseURL: ARK_BASE_URL });
  const b64 = screenshot.toString('base64');
  const t0 = Date.now();
  // NOTE(review): parts of this prompt look truncated (non-ASCII text appears to have been
  // stripped from several lines). It is kept byte-for-byte; restore it from the original source.
  const systemPrompt = `你是一个 Windows 桌面操作助手。
PC 4.x
你的任务: ${task}
JSON, :
{"type":"click","x":<int>,"y":<int>,"reason":"<简短理由>"}
{"type":"hotkey","key":"ctrl+v"|"enter"|"ctrl+f"|"ctrl+1","reason":"..."}
{"type":"finished","reason":"..."}
{"type":"failed","reason":"为何无法完成"}
JSONclick , `;
  const resp = await client.chat.completions.create({
    model: MODEL_NAME,
    messages: [
      { role: 'system', content: systemPrompt },
      {
        role: 'user',
        content: [
          { type: 'text', text: '请给出下一步动作的 JSON。' },
          { type: 'image_url', image_url: { url: 'data:image/png;base64,' + b64 } },
        ],
      },
    ],
    temperature: 0.2,
    max_tokens: 400,
  });
  const latencyMs = Date.now() - t0;
  const raw = resp.choices?.[0]?.message?.content?.trim() || '';
  return {
    raw,
    action: parseVlmAction(raw),
    promptTokens: resp.usage?.prompt_tokens ?? 0,
    completionTokens: resp.usage?.completion_tokens ?? 0,
    totalTokens: resp.usage?.total_tokens ?? 0,
    latencyMs,
  };
}
// ===== Single run =====
/** Outcome record of one probe run, as stored in the JSON report. */
interface RunResult {
  /** 1-based run number. */
  index: number;
  /** The probe message text for this run. */
  text: string;
  /** Final classification of the run. */
  status:
    | 'success'
    | 'window-missing'
    | 'verify-failed'
    | 'model-failed'
    | 'action-failed'
    | 'exception';
  /** Number of trace entries recorded. */
  steps: number;
  /** Number of VLM calls that returned. */
  modelCalls: number;
  /** Sum of total tokens over those calls. */
  totalTokens: number;
  /** Wall-clock duration of the run in milliseconds. */
  durationMs: number;
  /** Failure detail; absent on success. */
  error?: string;
  /** Per-step log of actions (and raw model output for VLM steps). */
  trace: { step: number; action: any; raw?: string }[];
}
/** Writes a debug screenshot to tools/visual_agent_probe/debug/run-<index>-<ts>-<label>.png. */
function saveDebugShot(index: number, ts: number, label: string, png: Buffer): void {
  const debugDir = path.join(REPO_ROOT, 'tools', 'visual_agent_probe', 'debug');
  fs.mkdirSync(debugDir, { recursive: true });
  fs.writeFileSync(path.join(debugDir, `run-${index}-${ts}-${label}.png`), png);
}

/** Presses modifier+key and always releases the modifier, even if the key press fails. */
async function tapChord(keyboard: any, modifier: any, key: any): Promise<void> {
  await keyboard.pressKey(modifier);
  try {
    await keyboard.pressKey(key);
    await keyboard.releaseKey(key);
  } finally {
    // Without this a failed key press would leave the modifier held down system-wide.
    await keyboard.releaseKey(modifier);
  }
}

/**
 * Executes one probe run: activate WeChat, verify via VLM that the target chat is open,
 * paste and send the probe text, then verify via VLM that the message is visible.
 *
 * @param index 1-based run number (part of the probe text and debug file names).
 * @param input Object returned by loadInput(): `{ nut, clipboardyWrite }`.
 * @param ts Timestamp shared by all runs of this invocation.
 * @returns A RunResult; expected failures are reported through `status`/`error`
 *          ('window-missing', 'exception', 'model-failed', 'verify-failed', 'action-failed').
 */
async function runOnce(index: number, input: any, ts: number): Promise<RunResult> {
  const text = `[probe-${index}-${ts}]`;
  const trace: RunResult['trace'] = [];
  let modelCalls = 0;
  let totalTokens = 0;
  const t0 = Date.now();

  // Builds the result from the counters as they stand at the moment of the call.
  const finish = (status: RunResult['status'], error?: string): RunResult => ({
    index,
    text,
    status,
    steps: trace.length,
    modelCalls,
    totalTokens,
    durationMs: Date.now() - t0,
    trace,
    ...(error === undefined ? {} : { error }),
  });

  const win = findWeixinWindow();
  if (!win) return finish('window-missing', 'weixin not running');
  console.log(` win: hwnd=${win.hwnd} title=${win.title} bounds=${JSON.stringify(win.bounds)}`);
  activateWindow(win.hwnd);
  await sleep(600);

  // ====================== Phase 0 simplification: the target chat is assumed to be open already ======================
  // (No Ctrl+F navigation: in WeChat 4.x the first global-search hit is often an official
  //  account rather than the target chat, so Enter would open the wrong thing.)
  // (Navigation is deferred to Phase A, possibly arrow keys + Enter or VLM-guided clicks.)
  // Only the core chain "paste + send + verify" is validated here.

  // ====================== Step 4: VLM verifies that the current chat is the target chat ======================
  let shot1: Buffer;
  try {
    shot1 = await captureWindowPng(win);
    // DEBUG: persist the screenshot
    saveDebugShot(index, ts, 'step4-verify-in-chat', shot1);
  } catch (e: any) {
    return finish('exception', 'capture failed: ' + (e?.message || e));
  }

  let vlm: VlmResponse;
  try {
    vlm = await callVlm(shot1, `仔细看截图。微信主窗口右侧是当前打开的聊天对话,顶部应该有该聊天的名字。判断当前打开的聊天是否就是"${TARGET_ROOM}"(看右侧上方的标题文字)。如果是, 输出 {"type":"finished","reason":"current chat is ${TARGET_ROOM}"}。如果不是 (打开的是其他聊天 / 公众号面板 / 通讯录), 输出 {"type":"failed","reason":"当前聊天是XX, 不是${TARGET_ROOM}"}。`);
    modelCalls++;
    totalTokens += vlm.totalTokens;
    trace.push({ step: 4, action: vlm.action, raw: vlm.raw });
    console.log(` vlm-1 (verify-in-chat) action=${JSON.stringify(vlm.action)} tokens=${vlm.totalTokens} latency=${vlm.latencyMs}ms`);
  } catch (e: any) {
    return finish('model-failed', e?.message || String(e));
  }
  // Optional chaining guards against a model reply that parsed to a non-object.
  if (vlm.action?.type !== 'finished') {
    return finish('verify-failed', 'not in target chat: ' + vlm.action?.reason);
  }

  // ====================== Step 5/6: paste + Enter ======================
  // Guarded so an input failure is reported as 'action-failed' with the token and
  // step counts gathered so far, instead of escaping as a bare exception.
  const { keyboard, Key } = input.nut;
  try {
    await input.clipboardyWrite(text);
    await sleep(150);
    await tapChord(keyboard, Key.LeftControl, Key.V);
    await sleep(jitter(300, 600));
    trace.push({ step: 5, action: { type: 'type', text } });
    await keyboard.pressKey(Key.Enter);
    await keyboard.releaseKey(Key.Enter);
    await sleep(jitter(800, 1300));
    trace.push({ step: 6, action: { type: 'hotkey', key: 'enter' } });
  } catch (e: any) {
    return finish('action-failed', 'paste/send failed: ' + (e?.message || e));
  }

  // ====================== Step 7: VLM verifies that the message was sent ======================
  let shot2: Buffer;
  try {
    shot2 = await captureWindowPng(win);
    // DEBUG: persist the screenshot
    saveDebugShot(index, ts, 'step7-verify-sent', shot2);
  } catch (e: any) {
    return finish('exception', 'verify capture failed: ' + (e?.message || e));
  }

  let vlm2: VlmResponse;
  try {
    vlm2 = await callVlm(shot2, `仔细看截图。微信聊天窗口右侧底部是最新消息。判断最新一条消息(气泡显示在屏幕右侧的, 即己方发出的)的文本是否包含 "${text}" (注意是字面字符串, 不要看意思)。如果包含, 输出 {"type":"finished","reason":"message visible at bottom"}。否则输出 {"type":"failed","reason":"最底部最新消息内容: <实际看到的文本>"}。`);
    modelCalls++;
    totalTokens += vlm2.totalTokens;
    trace.push({ step: 7, action: vlm2.action, raw: vlm2.raw });
    console.log(` vlm-2 (verify-sent) action=${JSON.stringify(vlm2.action)} tokens=${vlm2.totalTokens} latency=${vlm2.latencyMs}ms`);
  } catch (e: any) {
    return finish('model-failed', 'verify-sent model err: ' + (e?.message || e));
  }

  if (vlm2.action?.type === 'finished') {
    return finish('success');
  }
  return finish('verify-failed', 'message not visible in chat: ' + vlm2.action?.reason);
}
// ===== main =====
/**
 * Entry point: prints the configuration, prepares Win32/input bindings, waits 8 s for the
 * operator to focus WeChat, performs N_RUNS probe runs with a 3 s cooldown in between,
 * then prints a summary and writes the raw JSON report.
 *
 * Exits the process with code 1 when not on Windows, when the API key or run count is
 * unusable, or when no WeChat window is found.
 */
async function main() {
  console.log('========================================');
  console.log('Visual Agent Phase 0 PoC');
  console.log(` model: ${MODEL_NAME}`);
  console.log(` baseUrl: ${ARK_BASE_URL}`);
  // Show only a short prefix so logs and pasted console output do not leak most of the key.
  console.log(` apiKey: ${ARK_API_KEY ? ARK_API_KEY.slice(0, 4) + '...' : '(not set)'}`);
  console.log(` N runs: ${N_RUNS}`);
  console.log(` target: ${TARGET_ROOM}`);
  console.log('========================================');
  if (process.platform !== 'win32') {
    console.error('This probe only runs on Windows.');
    process.exit(1);
  }
  // Fail fast on configuration that would otherwise waste every run or poison the statistics.
  if (!ARK_API_KEY) {
    console.error('ARK_API_KEY is not set. Provide it via the environment (.env) and try again.');
    process.exit(1);
  }
  if (!Number.isInteger(N_RUNS) || N_RUNS < 1) {
    // A zero/NaN/fractional count would make the loop misbehave and the averages divide by 0 or NaN.
    console.error(`Invalid run count ${String(N_RUNS)}; N must be a positive integer.`);
    process.exit(1);
  }
  loadKoffi();
  ensureDpiAware();
  console.log('DPI Aware set.');
  const input = await loadInput();
  console.log('nut.js + clipboardy loaded.');
  const win = findWeixinWindow();
  if (!win) {
    console.error('WeChat (Weixin / 微信) window not found. Open it and try again.');
    process.exit(1);
  }
  console.log(`WeChat window found. hwnd=${win.hwnd} title=${win.title} bounds=${JSON.stringify(win.bounds)}`);
  console.log('\nStarting in 8 seconds. NOW switch to WeChat and open 文件传输助手. DO NOT touch keyboard/mouse after.\n');
  await sleep(8000);
  const startedAt = Date.now();
  const ts = Date.now();
  const results: RunResult[] = [];
  for (let i = 1; i <= N_RUNS; i++) {
    console.log(`\n--- Run ${i}/${N_RUNS} ---`);
    const runStartedAt = Date.now();
    let r: RunResult;
    try {
      r = await runOnce(i, input, ts);
    } catch (e: any) {
      // Record the real elapsed time so a crashed run does not drag the average towards 0.
      r = { index: i, text: `[probe-${i}-${ts}]`, status: 'exception', steps: 0, modelCalls: 0, totalTokens: 0, durationMs: Date.now() - runStartedAt, trace: [], error: e?.message || String(e) };
    }
    results.push(r);
    console.log(` result: status=${r.status} steps=${r.steps} modelCalls=${r.modelCalls} tokens=${r.totalTokens} dur=${r.durationMs}ms${r.error ? ' err=' + r.error : ''}`);
    if (i < N_RUNS) {
      console.log(' cooldown 3s ...');
      await sleep(3000);
    }
  }
  const elapsed = Date.now() - startedAt;
  const successCount = results.filter(r => r.status === 'success').length;
  const successRate = (successCount / N_RUNS * 100).toFixed(1);
  const totalTokens = results.reduce((s, r) => s + r.totalTokens, 0);
  const avgDur = Math.round(results.reduce((s, r) => s + r.durationMs, 0) / N_RUNS);
  const avgTokens = Math.round(totalTokens / N_RUNS);
  console.log('\n========================================');
  console.log('Summary');
  console.log(` total runs: ${N_RUNS}`);
  console.log(` successes: ${successCount}`);
  console.log(` success rate: ${successRate}%`);
  console.log(` total elapsed: ${(elapsed / 1000).toFixed(1)}s`);
  console.log(` avg duration/run: ${avgDur}ms`);
  console.log(` total tokens: ${totalTokens}`);
  console.log(` avg tokens/run: ${avgTokens}`);
  console.log('========================================');
  console.log('Status breakdown:');
  const byStatus: Record<string, number> = {};
  for (const r of results) byStatus[r.status] = (byStatus[r.status] || 0) + 1;
  for (const [k, v] of Object.entries(byStatus)) console.log(` ${k}: ${v}`);
  console.log('========================================');
  // Persist the JSON results plus raw VLM output for later use as Task 3 fixtures.
  // NOTE(review): the report file name carries a fixed date, so each invocation overwrites
  // the previous report; confirm this is intended.
  const reportDir = path.join(REPO_ROOT, 'docs', 'superpowers', 'followups');
  fs.mkdirSync(reportDir, { recursive: true });
  const reportPath = path.join(reportDir, `2026-05-14-visual-agent-poc-raw.json`);
  fs.writeFileSync(reportPath, JSON.stringify({
    model: MODEL_NAME,
    baseUrl: ARK_BASE_URL,
    runCount: N_RUNS,
    elapsed,
    successCount,
    successRate: Number(successRate),
    avgDurationMs: avgDur,
    totalTokens,
    byStatus,
    results,
  }, null, 2));
  console.log(`Raw report written to ${reportPath}`);
  console.log('\nNow check WeChat 文件传输助手: you should see ' + N_RUNS + ' messages like [probe-i-' + ts + '].');
  console.log('If success rate >= 80%, proceed to Phase A.');
  console.log('If < 80%, stop and discuss fallback (UI-TARS / Sonnet vision / different prompt).');
}
// Script entry: run main() and turn any unhandled failure into a non-zero exit code.
main().catch((fatal) => {
  // Prefer the stack trace when the thrown value carries one.
  const detail = fatal?.stack || fatal;
  console.error('FATAL:', detail);
  process.exit(1);
});