// GPU_GUARD_MONOREPO/tools/visual_agent_probe/run-once.ts

/**
 * Phase 0 PoC: Visual Agent 端到端验真
 *
 * 目标:验证 "截屏 -> 火山 VLM -> 解析 -> nut.js 点击 + 粘贴 -> 发送" 这条链路
 *      在微信 4.x 文件传输助手里能不能跑通,跑 N 次统计成功率。
 *
 * 不依赖 neta backend。所有依赖直接通过 npm 安装到 backend node_modules。
 * 火山引擎配置通过环境变量 ARK_API_KEY / ARK_BASE_URL / MODEL_NAME 传入(脚本不会自动加载 .env 文件,也不连接数据库)。
 *
 * 用法:
 *   1) 打开 PC 微信 + 登录 + 打开"文件传输助手"对话(主窗口可见,不要最小化)
 *   2) cd 到仓库根目录,跑:
 *        pnpm --filter @neta/backend exec tsx ../../tools/visual_agent_probe/run-once.ts [N]
 *      N = 跑几次(默认 3,完整门禁验证应跑 20)
 *   3) 等待运行,期间不要碰键盘鼠标
 *   4) 完成后看微信"文件传输助手"是否收到 N 条 [probe-i-ts] 消息
 *   5) 控制台会打印逐次结果 + 总成功率
 *
 * 注意:全程不要切窗口/碰鼠标键盘。脚本每条间隔 3 秒。
 */

/* eslint-disable @typescript-eslint/no-explicit-any */

import * as fs from 'node:fs';
import * as path from 'node:path';

// Repository root (this script lives in tools/visual_agent_probe/).
const REPO_ROOT = path.resolve(__dirname, '..', '..');

// Let bare `require('<pkg>')` calls fall back to the backend package's node_modules.
//
// `Module.globalPaths` is only a copy that `_initPaths()` publishes for introspection,
// so pushing onto it does not change how modules are resolved. The search list is
// rebuilt from the NODE_PATH environment variable, so the directory is prepended
// there first and `_initPaths()` is invoked afterwards.
// NOTE: `_initPaths` is an undocumented Node.js internal; the absolute-path requires
// used elsewhere in this file keep working even if it ever disappears.
const backendNodeModules = path.join(REPO_ROOT, 'packages', 'backend', 'node_modules');
if (fs.existsSync(backendNodeModules)) {
  const inheritedNodePath = process.env.NODE_PATH;
  process.env.NODE_PATH = inheritedNodePath
    ? backendNodeModules + path.delimiter + inheritedNodePath
    : backendNodeModules;
}
(require('module').Module as any)._initPaths();

// ===== 配置 =====
const ARGS = process.argv.slice(2);

/**
 * Parse the optional run-count CLI argument.
 *
 * @param raw      First CLI argument, or undefined when none was given.
 * @param fallback Value used when the argument is absent or empty.
 * @returns A positive integer run count.
 * @throws Error when the argument is present but is not a positive integer
 *         (previously such input produced NaN/0 and a summary full of NaN).
 */
function parseRunCount(raw: string | undefined, fallback = 3): number {
  if (raw === undefined || raw === '') return fallback;
  const count = Number(raw);
  if (!Number.isInteger(count) || count < 1) {
    throw new Error(`Invalid run count "${raw}": expected a positive integer.`);
  }
  return count;
}

const N_RUNS = parseRunCount(ARGS[0]);

// Volcano Engine (Ark) settings come from the process environment.
// SECURITY: no API key is embedded in source any more. The key that used to be
// hardcoded here must be treated as leaked and rotated.
const ARK_API_KEY = process.env.ARK_API_KEY || '';
if (!ARK_API_KEY) {
  console.warn('ARK_API_KEY is not set; model calls will be rejected until it is exported in the environment.');
}
const ARK_BASE_URL = process.env.ARK_BASE_URL || 'https://ark.cn-beijing.volces.com/api/v3';
const MODEL_NAME = process.env.MODEL_NAME || 'doubao-seed-2-0-pro-260215';

// Chat the probe expects to be open in WeChat.
const TARGET_ROOM = '文件传输助手';

// ===== Win32 API via koffi =====
// FFI handles; they stay undefined until loadKoffi() has run.
let koffi: any;
let user32: any;
let kernel32: any;

/**
 * Load koffi from the backend node_modules directory and open
 * user32.dll and kernel32.dll through it.
 */
function loadKoffi(): void {
  koffi = require(path.join(backendNodeModules, 'koffi'));
  [user32, kernel32] = ['user32.dll', 'kernel32.dll'].map((dll) => koffi.load(dll));
}

/**
 * Best-effort request to make this process per-monitor DPI aware (V2).
 *
 * Does nothing on non-Windows platforms. Failures are never fatal: both a thrown
 * FFI error and a `false` return from the API are reported as warnings so the
 * caller can continue.
 */
function ensureDpiAware() {
  if (process.platform !== 'win32') return;
  // DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2
  const PER_MONITOR_AWARE_V2 = -4;
  try {
    const SetProcessDpiAwarenessContext = user32.func('SetProcessDpiAwarenessContext', 'bool', ['intptr']);
    // The API reports rejection through its BOOL result rather than by throwing,
    // so the return value has to be inspected explicitly.
    const applied = SetProcessDpiAwarenessContext(PER_MONITOR_AWARE_V2);
    if (!applied) {
      console.warn('  [dpi] SetProcessDpiAwarenessContext returned false (awareness may already be set for this process).');
    }
  } catch (e: unknown) {
    console.warn('  [dpi] SetProcessDpiAwarenessContext failed:', e instanceof Error && e.message ? e.message : e);
  }
}

/** Description of the WeChat window selected by findWeixinWindow(). */
interface WinHandle {
  /** Window id taken from the node-screenshots `id()` accessor; passed to the user32 calls. */
  hwnd: number;
  /** Owning process id, from `pid()`. */
  pid: number;
  /** Window title, from `title()`. */
  title: string;
  /** Position and size as reported by `x()`, `y()`, `width()` and `height()`. */
  bounds: {
    x: number;
    y: number;
    width: number;
    height: number;
  };
  /** The underlying node-screenshots Window instance; used to take screenshots. */
  nsWindow: any;
}

/**
 * Enumerate windows through node-screenshots and pick the WeChat main window.
 *
 * A window qualifies when its app name is 'Weixin' or 'WeChat' and it is not
 * minimized; among those, the first one with the largest width * height wins.
 * Windows whose accessors throw during the scan are skipped.
 *
 * @returns The selected window, or null on non-Windows platforms or when no
 *          window qualifies.
 */
function findWeixinWindow(): WinHandle | null {
  if (process.platform !== 'win32') return null;

  const { Window: NsWindow } = require(path.join(backendNodeModules, 'node-screenshots'));
  const wechatAppNames = ['Weixin', 'WeChat'];

  let largest: any = null;
  let largestArea = 0;
  for (const candidate of NsWindow.all()) {
    try {
      const isWeChat = wechatAppNames.includes(candidate.appName());
      if (!isWeChat || candidate.isMinimized()) continue;
      const area = candidate.width() * candidate.height();
      if (area > largestArea) {
        largest = candidate;
        largestArea = area;
      }
    } catch {
      /* accessor failed for this window: ignore it and keep scanning */
    }
  }

  if (largest === null) return null;

  const hwnd = largest.id();
  const pid = largest.pid();
  const title = largest.title();
  const bounds = {
    x: largest.x(),
    y: largest.y(),
    width: largest.width(),
    height: largest.height(),
  };
  return { hwnd, pid, title, bounds, nsWindow: largest };
}

/**
 * Restore the window if it is minimized, raise it, and ask Windows to make it
 * the foreground window.
 *
 * The foreground request can be refused by Windows. That is only reported as a
 * warning (the function stays best-effort), but it matters to callers because
 * any keystrokes sent afterwards would land in whichever window kept focus.
 *
 * @param hwnd Window handle to activate.
 */
function activateWindow(hwnd: number) {
  const SW_RESTORE = 9;
  const SetForegroundWindow = user32.func('SetForegroundWindow', 'bool', ['intptr']);
  const ShowWindow = user32.func('ShowWindow', 'bool', ['intptr', 'int']);
  const BringWindowToTop = user32.func('BringWindowToTop', 'bool', ['intptr']);
  const IsIconic = user32.func('IsIconic', 'bool', ['intptr']);

  if (IsIconic(hwnd)) {
    ShowWindow(hwnd, SW_RESTORE);
  }
  BringWindowToTop(hwnd);
  if (!SetForegroundWindow(hwnd)) {
    console.warn(`  [focus] SetForegroundWindow(${hwnd}) returned false; keystrokes may go to another window.`);
  }
}

// ===== 截屏 =====
// 注意:必须每次截图前重新 enumerate, 否则 node-screenshots Window 对象会返回缓存的旧帧
/**
 * Capture the WeChat window and return it as PNG bytes.
 *
 * The window is looked up again on every call instead of using the handle that
 * was passed in; per the note above this function, reusing a node-screenshots
 * Window object returns a cached, stale frame.
 *
 * @param _winIgnored Unused; kept so callers can pass the handle they hold.
 * @throws Error('weixin window vanished') when the window can no longer be found.
 */
async function captureWindowPng(_winIgnored: WinHandle): Promise<Buffer> {
  const current = findWeixinWindow();
  if (current === null) {
    throw new Error('weixin window vanished');
  }
  const pngBytes = current.nsWindow.captureImageSync().toPngSync();
  return Buffer.from(pngBytes);
}

// ===== 剪贴板 (用 Windows clip.exe, 避免 clipboardy v5 ESM 问题) =====
/**
 * Put `text` on the Windows clipboard by piping it into clip.exe.
 *
 * @param text Text to place on the clipboard.
 * @throws Error when clip.exe cannot be started or exits unsuccessfully; the
 *         message carries the spawn error, stderr, or the exit status/signal.
 */
function clipboardWrite(text: string): void {
  const { spawnSync } = require('node:child_process');
  // clip.exe reads stdin and needs a UTF-16 LE BOM to handle Chinese text correctly.
  const utf16LeBom = Buffer.from([0xff, 0xfe]);
  const payload = Buffer.concat([utf16LeBom, Buffer.from(text, 'utf16le')]);

  const result = spawnSync('clip.exe', [], { input: payload });

  // A process that never started (e.g. clip.exe not found) has no stderr at all;
  // the reason is only available on `result.error`.
  if (result.error) {
    throw new Error('clip.exe failed: ' + (result.error.message || String(result.error)));
  }
  if (result.status !== 0) {
    const stderrText = result.stderr?.toString().trim();
    throw new Error(
      'clip.exe failed: ' + (stderrText || `exit status ${result.status}, signal ${result.signal}`),
    );
  }
}

// ===== 键鼠 (nut.js) =====
/**
 * Load nut.js from the backend node_modules directory and bundle it with the
 * clipboard writer.
 *
 * The keyboard and mouse auto-delay are both set to 30 ms: shorter than the
 * library default according to the original note, while still leaving a small gap.
 *
 * @returns `{ nut, clipboardyWrite }`, where `clipboardyWrite` is clipboardWrite.
 */
async function loadInput() {
  // Loaded by absolute path; nut.js is CommonJS so a plain require works.
  const nut = require(path.join(backendNodeModules, '@nut-tree-fork', 'nut-js'));

  const autoDelayMs = 30;
  for (const device of [nut.keyboard, nut.mouse]) {
    device.config.autoDelayMs = autoDelayMs;
  }

  return { nut, clipboardyWrite: clipboardWrite };
}

/** Return a promise that resolves (with no value) once a `ms`-millisecond timer fires. */
function sleep(ms: number): Promise<unknown> {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Random delay helper: floor of a uniformly random value in [min, max).
 * For integer bounds with min < max the result is an integer in [min, max - 1].
 */
function jitter(min: number, max: number): number {
  const span = max - min;
  return Math.floor(min + Math.random() * span);
}

// ===== VLM 调用 =====
/** Result of one vision-model call as returned by callVlm(). */
interface VlmResponse {
  /** Trimmed text content of the first completion choice ('' when absent). */
  raw: string;
  /** Action object parsed from `raw`; `type` is e.g. 'click', 'hotkey', 'finished' or 'failed'. */
  action: {
    type: string;
    x?: number;
    y?: number;
    key?: string;
    text?: string;
    reason?: string;
  };
  /** Token usage reported by the API (0 when the response carries no usage data). */
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
  /** Wall-clock time of the request in milliseconds. */
  latencyMs: number;
}

/**
 * Turn the model's raw reply into an action object.
 *
 * A surrounding markdown code fence is stripped before parsing. Anything that is
 * not valid JSON, or that parses to something other than an object with a string
 * `type` (for example `null`, an array or a bare string), is converted into a
 * `{ type: 'failed' }` action so callers can always read `action.type` safely.
 */
function parseVlmAction(raw: string): VlmResponse['action'] {
  // Remove a possible markdown code fence around the JSON.
  const cleaned = raw.replace(/^```(?:json)?\s*|\s*```$/g, '').trim();

  let parsed: unknown;
  try {
    parsed = JSON.parse(cleaned);
  } catch {
    return { type: 'failed', reason: 'json-parse-failed: ' + raw.slice(0, 200) };
  }

  const hasStringType =
    typeof parsed === 'object' &&
    parsed !== null &&
    !Array.isArray(parsed) &&
    typeof (parsed as { type?: unknown }).type === 'string';
  if (!hasStringType) {
    return { type: 'failed', reason: 'invalid-action-shape: ' + raw.slice(0, 200) };
  }
  return parsed as VlmResponse['action'];
}

/**
 * Send one screenshot plus a task description to the vision model and parse the
 * single action it replies with.
 *
 * @param screenshot PNG bytes of the WeChat window; sent inline as a base64 data URL.
 * @param task       Task text interpolated into the system prompt.
 * @returns The raw reply, the parsed action (a 'failed' action when the reply is
 *          unusable), token usage, and the request latency.
 * @throws Whatever the OpenAI client throws for transport or API errors.
 */
async function callVlm(screenshot: Buffer, task: string): Promise<VlmResponse> {
  const oaiPath = path.join(backendNodeModules, 'openai');
  const OpenAIMod = require(oaiPath);
  // Works whether the package exposes the client as a default export or as the module itself.
  const OpenAI = OpenAIMod.default || OpenAIMod;
  const client = new OpenAI({ apiKey: ARK_API_KEY, baseURL: ARK_BASE_URL });

  const b64 = screenshot.toString('base64');
  const t0 = Date.now();

  const systemPrompt = `你是一个 Windows 桌面操作助手。
看到的是 PC 微信 4.x 主窗口截图。
你的任务: ${task}

输出严格 JSON, 不要任何解释或代码块标记:
{"type":"click","x":<int>,"y":<int>,"reason":"<简短理由>"}
或 {"type":"hotkey","key":"ctrl+v"|"enter"|"ctrl+f"|"ctrl+1","reason":"..."}
或 {"type":"finished","reason":"..."}
或 {"type":"failed","reason":"为何无法完成"}

只输出一个动作 JSON。click 坐标必须在截图范围内, 以截图左上角为原点。`;

  const resp = await client.chat.completions.create({
    model: MODEL_NAME,
    messages: [
      { role: 'system', content: systemPrompt },
      {
        role: 'user',
        content: [
          { type: 'text', text: '请给出下一步动作的 JSON。' },
          { type: 'image_url', image_url: { url: 'data:image/png;base64,' + b64 } },
        ],
      },
    ],
    temperature: 0.2,
    max_tokens: 400,
  });

  const latencyMs = Date.now() - t0;
  const raw: string = resp.choices?.[0]?.message?.content?.trim() || '';

  return {
    raw,
    action: parseVlmAction(raw),
    promptTokens: resp.usage?.prompt_tokens || 0,
    completionTokens: resp.usage?.completion_tokens || 0,
    totalTokens: resp.usage?.total_tokens || 0,
    latencyMs,
  };
}

// ===== 单次 run =====
/** Outcome of a single probe run, as printed by main() and written to the JSON report. */
interface RunResult {
  /** 1-based run number. */
  index: number;
  /** The probe text for this run. */
  text: string;
  /** Final classification of the run. */
  status:
    | 'success'
    | 'window-missing'
    | 'verify-failed'
    | 'model-failed'
    | 'action-failed'
    | 'exception';
  /** Number of trace entries recorded when the run ended. */
  steps: number;
  /** Number of vision-model calls that completed. */
  modelCalls: number;
  /** Sum of the tokens reported by those calls. */
  totalTokens: number;
  /** Wall-clock duration of the run in milliseconds. */
  durationMs: number;
  /** Failure description; absent on success. */
  error?: string;
  /** Per-step record of actions (and raw model output where available). */
  trace: Array<{ step: number; action: any; raw?: string }>;
}

/**
 * Capture the WeChat window and also save the PNG under
 * tools/visual_agent_probe/debug/ so a run can be inspected afterwards.
 */
async function captureDebugShot(win: WinHandle, fileName: string): Promise<Buffer> {
  const png = await captureWindowPng(win);
  const debugDir = path.join(REPO_ROOT, 'tools', 'visual_agent_probe', 'debug');
  fs.mkdirSync(debugDir, { recursive: true });
  fs.writeFileSync(path.join(debugDir, fileName), png);
  return png;
}

/**
 * Press `keys` in order, then release them in reverse order.
 * Every key that was pressed gets a release attempt even if a later press or an
 * earlier release fails, so a modifier such as Ctrl is not left held down.
 */
async function tapKeys(nut: any, ...keys: any[]): Promise<void> {
  const held: any[] = [];
  let releaseError: unknown;
  try {
    for (const key of keys) {
      await nut.keyboard.pressKey(key);
      held.push(key);
    }
  } finally {
    for (const key of held.reverse()) {
      try {
        await nut.keyboard.releaseKey(key);
      } catch (e) {
        if (releaseError === undefined) releaseError = e;
      }
    }
  }
  if (releaseError !== undefined) throw releaseError;
}

/**
 * Execute one probe run: confirm the target chat is open, paste and send the
 * probe text, then confirm the message is visible.
 *
 * Phase 0 simplification: the target chat is assumed to have been opened by hand.
 * No Ctrl+F navigation is attempted because, per the original notes, the first
 * global-search hit in WeChat 4.x is often an official account rather than the
 * target chat. Only the paste + send + verify chain is exercised.
 *
 * @param index 1-based run number; part of the probe text and debug file names.
 * @param input Object from loadInput(): `{ nut, clipboardyWrite }`.
 * @param ts    Timestamp shared by all runs; part of the probe text and file names.
 * @returns A RunResult. Failures of capture, model calls, and clipboard/keyboard
 *          input are reported through `status` / `error` instead of being thrown,
 *          so the trace and timing gathered so far are preserved.
 */
async function runOnce(index: number, input: any, ts: number): Promise<RunResult> {
  const text = `[probe-${index}-${ts}]`;
  const trace: RunResult['trace'] = [];
  let modelCalls = 0;
  let totalTokens = 0;
  const t0 = Date.now();

  const describe = (e: any): string => String(e?.message || e);
  // Builds the result from the state accumulated so far; `error` is omitted on success.
  const finish = (status: RunResult['status'], error?: string): RunResult => {
    const result: RunResult = {
      index,
      text,
      status,
      steps: trace.length,
      modelCalls,
      totalTokens,
      durationMs: Date.now() - t0,
      trace,
    };
    if (error !== undefined) result.error = error;
    return result;
  };

  const win = findWeixinWindow();
  if (!win) {
    return finish('window-missing', 'weixin not running');
  }
  console.log(`  win: hwnd=${win.hwnd} title=${win.title} bounds=${JSON.stringify(win.bounds)}`);
  activateWindow(win.hwnd);
  await sleep(600);

  // ---------- Step 4: ask the model whether the open chat is the target chat ----------
  let shot1: Buffer;
  try {
    shot1 = await captureDebugShot(win, `run-${index}-${ts}-step4-verify-in-chat.png`);
  } catch (e: any) {
    return finish('exception', 'capture failed: ' + describe(e));
  }

  let vlm: VlmResponse;
  try {
    vlm = await callVlm(shot1, `仔细看截图。微信主窗口右侧是当前打开的聊天对话,顶部应该有该聊天的名字。判断当前打开的聊天是否就是"${TARGET_ROOM}"(看右侧上方的标题文字)。如果是, 输出 {"type":"finished","reason":"current chat is ${TARGET_ROOM}"}。如果不是 (打开的是其他聊天 / 公众号面板 / 通讯录), 输出 {"type":"failed","reason":"当前聊天是XX, 不是${TARGET_ROOM}"}。`);
    modelCalls++;
    totalTokens += vlm.totalTokens;
    trace.push({ step: 4, action: vlm.action, raw: vlm.raw });
    console.log(`  vlm-1 (verify-in-chat) action=${JSON.stringify(vlm.action)} tokens=${vlm.totalTokens} latency=${vlm.latencyMs}ms`);
  } catch (e: any) {
    return finish('model-failed', describe(e));
  }

  // Optional chaining: the parsed action comes from model output and may not be an object.
  if (vlm.action?.type !== 'finished') {
    return finish('verify-failed', 'not in target chat: ' + vlm.action?.reason);
  }

  // ---------- Steps 5-6: paste the probe text and press Enter ----------
  const { nut } = input;
  try {
    await input.clipboardyWrite(text);
    await sleep(150);
    await tapKeys(nut, nut.Key.LeftControl, nut.Key.V);
    await sleep(jitter(300, 600));
    trace.push({ step: 5, action: { type: 'type', text } });

    await tapKeys(nut, nut.Key.Enter);
    await sleep(jitter(800, 1300));
    trace.push({ step: 6, action: { type: 'hotkey', key: 'enter' } });
  } catch (e: any) {
    return finish('action-failed', 'input action failed: ' + describe(e));
  }

  // ---------- Step 7: ask the model whether the message is visible ----------
  let shot2: Buffer;
  try {
    shot2 = await captureDebugShot(win, `run-${index}-${ts}-step7-verify-sent.png`);
  } catch (e: any) {
    return finish('exception', 'verify capture failed: ' + describe(e));
  }

  let vlm2: VlmResponse;
  try {
    vlm2 = await callVlm(shot2, `仔细看截图。微信聊天窗口右侧底部是最新消息。判断最新一条消息(气泡显示在屏幕右侧的, 即己方发出的)的文本是否包含 "${text}" (注意是字面字符串, 不要看意思)。如果包含, 输出 {"type":"finished","reason":"message visible at bottom"}。否则输出 {"type":"failed","reason":"最底部最新消息内容: <实际看到的文本>"}。`);
    modelCalls++;
    totalTokens += vlm2.totalTokens;
    trace.push({ step: 7, action: vlm2.action, raw: vlm2.raw });
    console.log(`  vlm-2 (verify-sent)  action=${JSON.stringify(vlm2.action)} tokens=${vlm2.totalTokens} latency=${vlm2.latencyMs}ms`);
  } catch (e: any) {
    return finish('model-failed', 'verify-sent model err: ' + describe(e));
  }

  if (vlm2.action?.type === 'finished') {
    return finish('success');
  }
  return finish('verify-failed', 'message not visible in chat: ' + vlm2.action?.reason);
}

// ===== main =====
/**
 * Probe entry point: validates the environment, runs `N_RUNS` probe runs with a
 * 3-second cooldown between them, prints a summary, and writes the raw results
 * to a JSON report under docs/superpowers/followups/.
 *
 * Exits the process with code 1 when the platform is not Windows, the run count
 * is not a positive integer, the API key is missing, or no WeChat window is found.
 */
async function main() {
  console.log('========================================');
  console.log('Visual Agent Phase 0 PoC');
  console.log(`  model:    ${MODEL_NAME}`);
  console.log(`  baseUrl:  ${ARK_BASE_URL}`);
  // Only a short prefix is echoed so console logs do not carry a usable part of the secret.
  console.log(`  apiKey:   ${ARK_API_KEY ? ARK_API_KEY.slice(0, 4) + '...' : '(not set)'}`);
  console.log(`  N runs:   ${N_RUNS}`);
  console.log(`  target:   ${TARGET_ROOM}`);
  console.log('========================================');

  if (process.platform !== 'win32') {
    console.error('This probe only runs on Windows.');
    process.exit(1);
  }

  // A NaN, zero, negative or fractional count would skip or distort the loop and
  // turn every average in the summary into NaN.
  if (!Number.isInteger(N_RUNS) || N_RUNS < 1) {
    console.error(`Invalid run count "${ARGS[0]}". Pass a positive integer (default 3).`);
    process.exit(1);
  }

  // Fail before touching the UI: without a key every model call would be rejected.
  if (!ARK_API_KEY) {
    console.error('ARK_API_KEY is not set. Export it in the environment and try again.');
    process.exit(1);
  }

  loadKoffi();
  ensureDpiAware();
  console.log('DPI Aware set.');

  const input = await loadInput();
  console.log('nut.js + clipboardy loaded.');

  const win = findWeixinWindow();
  if (!win) {
    console.error('WeChat (Weixin / 微信) window not found. Open it and try again.');
    process.exit(1);
  }
  console.log(`WeChat window found. hwnd=${win.hwnd} title=${win.title} bounds=${JSON.stringify(win.bounds)}`);

  console.log('\nStarting in 8 seconds. NOW switch to WeChat and open 文件传输助手. DO NOT touch keyboard/mouse after.\n');
  await sleep(8000);

  const startedAt = Date.now();
  // One timestamp is shared by all runs so the probe messages of a session are easy to spot.
  const ts = Date.now();
  const results: RunResult[] = [];

  for (let i = 1; i <= N_RUNS; i++) {
    console.log(`\n--- Run ${i}/${N_RUNS} ---`);
    let r: RunResult;
    try {
      r = await runOnce(i, input, ts);
    } catch (e: any) {
      // Safety net: anything runOnce did not classify itself is recorded as an exception.
      r = { index: i, text: `[probe-${i}-${ts}]`, status: 'exception', steps: 0, modelCalls: 0, totalTokens: 0, durationMs: 0, trace: [], error: e?.message || String(e) };
    }
    results.push(r);
    console.log(`  result: status=${r.status} steps=${r.steps} modelCalls=${r.modelCalls} tokens=${r.totalTokens} dur=${r.durationMs}ms${r.error ? ' err=' + r.error : ''}`);
    if (i < N_RUNS) {
      console.log('  cooldown 3s ...');
      await sleep(3000);
    }
  }

  const elapsed = Date.now() - startedAt;
  const successCount = results.filter(r => r.status === 'success').length;
  const successRate = (successCount / N_RUNS * 100).toFixed(1);
  const totalTokens = results.reduce((s, r) => s + r.totalTokens, 0);
  const avgDur = Math.round(results.reduce((s, r) => s + r.durationMs, 0) / N_RUNS);
  const avgTokens = Math.round(totalTokens / N_RUNS);

  console.log('\n========================================');
  console.log('Summary');
  console.log(`  total runs:        ${N_RUNS}`);
  console.log(`  successes:         ${successCount}`);
  console.log(`  success rate:      ${successRate}%`);
  console.log(`  total elapsed:     ${(elapsed / 1000).toFixed(1)}s`);
  console.log(`  avg duration/run:  ${avgDur}ms`);
  console.log(`  total tokens:      ${totalTokens}`);
  console.log(`  avg tokens/run:    ${avgTokens}`);
  console.log('========================================');
  console.log('Status breakdown:');
  const byStatus: Record<string, number> = {};
  for (const r of results) byStatus[r.status] = (byStatus[r.status] || 0) + 1;
  for (const [k, v] of Object.entries(byStatus)) console.log(`  ${k}: ${v}`);
  console.log('========================================');

  // Persist the JSON results together with the raw model output for later fixture use.
  const reportDir = path.join(REPO_ROOT, 'docs', 'superpowers', 'followups');
  fs.mkdirSync(reportDir, { recursive: true });
  const reportPath = path.join(reportDir, `2026-05-14-visual-agent-poc-raw.json`);
  fs.writeFileSync(reportPath, JSON.stringify({
    model: MODEL_NAME,
    baseUrl: ARK_BASE_URL,
    runCount: N_RUNS,
    elapsed,
    successCount,
    successRate: Number(successRate),
    avgDurationMs: avgDur,
    totalTokens,
    byStatus,
    results,
  }, null, 2));
  console.log(`Raw report written to ${reportPath}`);

  console.log('\nNow check WeChat 文件传输助手: you should see ' + N_RUNS + ' messages like [probe-i-' + ts + '].');
  console.log('If success rate >= 80%, proceed to Phase A.');
  console.log('If < 80%, stop and discuss fallback (UI-TARS / Sonnet vision / different prompt).');
}

// Script entry: run main() and turn any rejection into a logged fatal error with exit code 1.
void (async () => {
  try {
    await main();
  } catch (fatal: any) {
    console.error('FATAL:', fatal?.stack || fatal);
    process.exit(1);
  }
})();