OA0

OA0 是一个探索 AI 的社区

现在注册

已注册用户请登录

名称： perf-profiler
描述： 性能分析与优化工具。适用于诊断代码缓慢、测量CPU/内存使用率、生成火焰图、函数基准测试、API负载测试、查找内存泄漏或优化数据库查询等场景。
元数据： {"clawdbot":{"emoji":"⚡","requires":{"anyBins":["node","python3","go","curl","ab"]},"os":["linux","darwin","win32"]}}

性能分析器

测量、剖析并优化应用程序性能。涵盖CPU剖析、内存分析、火焰图、基准测试、负载测试以及特定语言的优化模式。

适用场景

诊断应用程序或函数运行缓慢的原因
测量CPU和内存使用率
生成火焰图以可视化热点路径
对函数或API端点进行基准测试
部署前对API进行负载测试
查找并修复内存泄漏
优化数据库查询性能
对比变更前后的性能差异

快速计时

命令行计时

# 计时任何命令
time my-command --flag

# 更精确：多次运行并统计
for i in $(seq 1 10); do
  /usr/bin/time -f "%e" my-command 2>&1
done | awk '{sum+=$1; sumsq+=$1*$1; count++} END {
  avg=sum/count;
  stddev=sqrt(sumsq/count - avg*avg);
  printf "runs=%d avg=%.3fs stddev=%.3fs\n", count, avg, stddev
}'

# Hyperfine（更好的基准测试工具）
# 安装：https://github.com/sharkdp/hyperfine
hyperfine 'command-a' 'command-b'
hyperfine --warmup 3 --runs 20 'my-command'
hyperfine --export-json results.json 'old-version' 'new-version'

内联计时（适用于任何语言）

// Node.js
console.time('operation');
await doExpensiveThing();
console.timeEnd('operation'); // 输出："operation: 142.3ms"

// 高精度计时
const start = performance.now();
await doExpensiveThing();
const elapsed = performance.now() - start;
console.log(`Elapsed: ${elapsed.toFixed(2)}ms`);

# Python
import time

start = time.perf_counter()
do_expensive_thing()
elapsed = time.perf_counter() - start
print(f"Elapsed: {elapsed:.4f}s")

# 上下文管理器
from contextlib import contextmanager

@contextmanager
def timer(label=""):
    """Context manager that prints the wall-clock time spent in its body.

    Args:
        label: Text printed before the elapsed time.

    The duration is measured with ``time.perf_counter`` and printed as
    ``"<label>: <seconds>s"`` with four decimals. Any exception raised by the
    body propagates unchanged.
    """
    start = time.perf_counter()
    try:
        yield
    finally:
        # Report in ``finally`` so a body that raises is still timed; slow
        # failing paths are often exactly what needs to be measured.
        elapsed = time.perf_counter() - start
        print(f"{label}: {elapsed:.4f}s")

with timer("data processing"):
    process_data()

// Go
start := time.Now()
doExpensiveThing()
fmt.Printf("Elapsed: %v\n", time.Since(start))

Node.js 性能剖析

使用 V8 检查器进行 CPU 剖析

# 生成 CPU 剖析文件（写入 .cpuprofile 文件）
node --cpu-prof app.js
# 在 Chrome DevTools > Performance 标签页中打开 .cpuprofile 文件

# 调整采样间隔（--cpu-prof-interval 的单位是微秒，默认 1000；它不控制剖析时长）
node --cpu-prof --cpu-prof-interval=100 app.js

# 检查运行中的进程
node --inspect app.js
# 在 Chrome 中打开 chrome://inspect，点击 "inspect"
# 转到 Performance 标签页，点击 Record 开始记录

堆快照（内存分析）

# 生成堆分配采样剖析文件（写入 .heapprofile 文件；这不是堆快照，堆快照请用下面的 v8.writeHeapSnapshot()）
node --heap-prof app.js

# 以编程方式获取快照
node -e "
const v8 = require('v8');
const fs = require('fs');

// 获取快照
const snapshotStream = v8.writeHeapSnapshot();
console.log('Heap snapshot written to:', snapshotStream);
"

# 比较堆快照以查找泄漏：
# 1. 获取快照 A（基线）
# 2. 运行可能泄漏的操作
# 3. 获取快照 B
# 4. 在 Chrome DevTools > Memory 中加载两者并使用 "Comparison" 视图

内存使用监控

// Log a snapshot of the process memory counters every 5 seconds.
setInterval(() => {
  // Render a byte count as megabytes with one decimal place, e.g. "12.5MB".
  const formatMB = (bytes) => `${(bytes / 1024 / 1024).toFixed(1)}MB`;
  const { rss, heapUsed, heapTotal, external } = process.memoryUsage();

  const report = {
    rss: formatMB(rss),
    heapUsed: formatMB(heapUsed),
    heapTotal: formatMB(heapTotal),
    external: formatMB(external),
  };
  console.log(report);
}, 5000);

// Detect heap growth between consecutive samples (one sample every 10 seconds).
// The baseline is seeded with the current heap size: starting from 0 would make
// the first tick treat the entire heap as growth and always emit a bogus warning.
let lastHeap = process.memoryUsage().heapUsed;
setInterval(() => {
  const heap = process.memoryUsage().heapUsed;
  const delta = heap - lastHeap;
  if (delta > 1024 * 1024) { // grew by more than 1MB since the previous sample
    console.warn(`Heap grew by ${(delta / 1024 / 1024).toFixed(1)}MB`);
  }
  // Always move the baseline so each tick compares against the previous sample.
  lastHeap = heap;
}, 10000);

Node.js 基准测试

/**
 * Minimal micro-benchmark helper: calls `fn` a few times untimed, then times
 * `iterations` back-to-back calls and prints the average cost per call.
 *
 * `fn` is invoked synchronously and its return value is ignored, so a returned
 * promise is not awaited.
 *
 * @param {string} name - Label printed at the start of the report line.
 * @param {Function} fn - Zero-argument function to measure.
 * @param {number} [iterations=10000] - Number of timed calls; positive integer.
 * @param {number} [warmup=100] - Number of untimed calls made before timing;
 *   non-negative integer.
 * @returns {{name: string, iterations: number, elapsedMs: number, msPerOp: number}}
 *   The measurements behind the printed line, for programmatic comparison.
 * @throws {RangeError} If `iterations` or `warmup` is out of range.
 */
function benchmark(name, fn, iterations = 10000, warmup = 100) {
  // Reject counts that would otherwise print a meaningless "NaNms/op".
  if (!Number.isInteger(iterations) || iterations <= 0) {
    throw new RangeError(`iterations must be a positive integer, got ${iterations}`);
  }
  if (!Number.isInteger(warmup) || warmup < 0) {
    throw new RangeError(`warmup must be a non-negative integer, got ${warmup}`);
  }

  // Warm-up calls are excluded from the measurement.
  for (let i = 0; i < warmup; i++) fn();

  const start = performance.now();
  for (let i = 0; i < iterations; i++) fn();
  const elapsedMs = performance.now() - start;
  const msPerOp = elapsedMs / iterations;

  console.log(`${name}: ${msPerOp.toFixed(4)}ms/op (${iterations} iterations in ${elapsedMs.toFixed(1)}ms)`);
  return { name, iterations, elapsedMs, msPerOp };
}

benchmark('JSON.parse', () => JSON.parse('{"key":"value","num":42}'));
benchmark('regex match', () => /^\d{4}-\d{2}-\d{2}$/.test('2026-02-03'));

Python 性能剖析

cProfile（内置 CPU 剖析器）

# 剖析脚本
python3 -m cProfile -s cumulative my_script.py

# 保存到文件以供分析
python3 -m cProfile -o profile.prof my_script.py

# 分析已保存的剖析文件
python3 -c "
import pstats
stats = pstats.Stats('profile.prof')
stats.sort_stats('cumulative')
stats.print_stats(20)
"

# 剖析特定函数
python3 -c "
import cProfile
from my_module import expensive_function

cProfile.run('expensive_function()', sort='cumulative')
"

line_profiler（逐行剖析）

# 安装
pip install line_profiler

# 在目标函数上添加 @profile 装饰器，然后运行：
kernprof -l -v my_script.py

# 编程式用法
from line_profiler import LineProfiler

def process_data(data):
    """Return the transformed items of ``data`` that pass validation.

    Each item is passed through ``transform`` and the transformed value is
    kept only when ``validate`` returns a truthy result for it. Both helpers
    are defined outside this snippet.
    """
    result = []
    for item in data:           # is this loop the bottleneck?
        transformed = transform(item)
        if validate(transformed):
            result.append(transformed)
    return result

profiler = LineProfiler()
profiler.add_function(process_data)
profiler.enable()
process_data(large_dataset)
profiler.disable()
profiler.print_stats()

内存剖析（Python）

# memory_profiler
pip install memory_profiler

# 逐行剖析内存使用
python3 -m memory_profiler my_script.py

from memory_profiler import profile

# ``profile`` is the decorator imported from memory_profiler above; it marks
# this function for line-by-line memory profiling.
@profile
def load_data():
    """Build and return a list of one million small dicts.

    Each dict has an integer ``id`` and a string ``value`` of the form
    ``item_<i>``; the list is deliberately large so the allocation shows up
    in the memory profile.
    """
    data = []
    for i in range(1000000):
        data.append({'id': i, 'value': f'item_{i}'})
    return data

# 跟踪一段时间内的内存使用
import tracemalloc

tracemalloc.start()

# ... 运行代码 ...

snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('lineno')
for stat in top_stats[:10]:
    print(stat)

Python 基准测试

import timeit

# 计时一个语句
result = timeit.timeit('sorted(range(1000))', number=10000)
print(f"sorted: {result:.4f}s for 10000 iterations")

# 比较两种方法
setup = "data = list(range(10000))"
t1 = timeit.timeit('list(filter(lambda x: x % 2 == 0, data))', setup=setup, number=1000)
t2 = timeit.timeit('[x for x in data if x % 2 == 0]', setup=setup, number=1000)
print(f"filter: {t1:.4f}s  |  listcomp: {t2:.4f}s  |  speedup: {t1/t2:.2f}x")

# pytest-benchmark
# pip install pytest-benchmark
# def test_sort(benchmark):
#     benchmark(sorted, list(range(1000)))

Go 性能剖析

内置 pprof

// 添加到 main.go 以启用可通过 HTTP 访问的剖析
import (
    "log"
    "net/http"
    _ "net/http/pprof"
)

// main starts the pprof HTTP endpoint on localhost:6060 in a background
// goroutine and then continues with the application.
func main() {
    go func() {
        // A nil handler serves http.DefaultServeMux, where the blank import of
        // net/http/pprof registers the /debug/pprof/ handlers.
        // ListenAndServe only returns on failure (for example when the port is
        // already in use), so log it instead of silently losing the endpoint.
        if err := http.ListenAndServe("localhost:6060", nil); err != nil {
            log.Printf("pprof server stopped: %v", err)
        }
    }()
    // ... rest of the application
}

# CPU profile (30 seconds)
# The URL is quoted because an unquoted "?" is a glob character; zsh aborts
# with "no matches found" before go tool pprof even runs.
go tool pprof "http://localhost:6060/debug/pprof/profile?seconds=30"

# 内存剖析
go tool pprof http://localhost:6060/debug/pprof/heap

# Goroutine 剖析
go tool pprof http://localhost:6060/debug/pprof/goroutine

# 在 pprof 交互模式中：
# top 20          - 按 CPU/内存排序的前 20 个函数
# list funcName   - 带注释的源代码
# web             - 在浏览器中打开调用图（SVG，需要安装 Graphviz）；火焰图请使用 -http 模式
# png > out.png   - 将调用图保存为图片

Go 基准测试

// math_test.go
// BenchmarkAdd measures the cost of one Add(42, 58) call.
// The loop runs b.N times; the testing package chooses b.N so the benchmark
// lasts long enough to be timed reliably. Add is defined outside this snippet.
func BenchmarkAdd(b *testing.B) {
    for i := 0; i < b.N; i++ {
        Add(42, 58)
    }
}

// BenchmarkSort1000 measures sort.Ints on a 1000-element slice of random ints.
func BenchmarkSort1000(b *testing.B) {
    // Fixture: 1000 random values in [0, 1000).
    data := make([]int, 1000)
    for i := range data {
        data[i] = rand.Intn(1000)
    }
    // Exclude the fixture setup above from the reported time.
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        // sort.Ints sorts in place, so each iteration sorts a fresh copy of the
        // unsorted input; the copy itself is part of the measured time.
        sort.Ints(append([]int{}, data...))
    }
}

# 运行基准测试
go test -bench=. -benchmem ./...

# 比较变更前后
go test -bench=. -count=5 ./... > old.txt
# ... 进行更改 ...
go test -bench=. -count=5 ./... > new.txt
go install golang.org/x/perf/cmd/benchstat@latest
benchstat old.txt new.txt

火焰图

生成火焰图

# Node.js: 0x（最简单）
npx 0x app.js
# 在浏览器中打开交互式火焰图

# Node.js: clinic.js（功能全面）
npx clinic flame -- node app.js
npx clinic doctor -- node app.js
npx clinic bubbleprof -- node app.js

# Python: py-spy（采样剖析器，无需修改代码）
pip install py-spy
py-spy record -o flame.svg -- python3 my_script.py

# 剖析运行中的 Python 进程
py-spy record -o flame.svg --pid 12345

# Go: built in
# The URL is quoted because an unquoted "?" is a glob character; zsh aborts
# with "no matches found" before go tool pprof even runs.
go tool pprof -http=:8080 "http://localhost:6060/debug/pprof/profile?seconds=30"
# Navigate to the "Flame Graph" view

# Linux（任何进程）: perf + flamegraph
perf record -g -p PID -- sleep 30
perf script | stackcollapse-perf.pl | flamegraph.pl > flame.svg

解读火焰图

关键概念：
- X轴：不是时间。它是栈帧的字母顺序排序。宽度 = 样本百分比。
- Y轴：栈深度。顶部 = 叶子函数（CPU 时间花费的地方）。
- 顶部的宽条 = 热点函数（优先优化这些）。
- 窄而高的栈 = 深层调用链（可能表示过度抽象）。

需要关注的内容：
1. 顶部的宽平台 → 主导 CPU 时间的函数
2. 多条路径汇聚到一个函数 → 共享瓶颈
3. GC/运行时帧占据显著宽度 → 内存压力
4. 意外出现的宽函数 → 性能缺陷

负载测试

基于 curl 的快速测试

# 单个请求计时
curl -o /dev/null -s -w "HTTP %{http_code} | Total: %{time_total}s | TTFB: %{time_starttransfer}s | Connect: %{time_connect}s\n" https://api.example.com/endpoint

# 顺序执行多个请求
for i in $(seq 1 20); do
  curl -o /dev/null -s -w "%{time_total}\n" https://api.example.com/endpoint
done | awk '{sum+=$1; count++; if($1>max)max=$1} END {printf "avg=%.3fs max=%.3fs n=%d\n", sum/count, max, count}'

Apache Bench (ab)

# 100 个请求，10 个并发
ab -n 100 -c 10 http://localhost:3000/api/endpoint

# 使用 POST 数据
ab -n 100 -c 10 -p data.json -T application/json http://localhost:3000/api/endpoint

# 需要关注的关键指标：
# - 每秒请求数（吞吐量）
# - 每个请求的时间（延迟）
# - 在一定时间内服务的请求百分比（p50, p90, p99）

wrk（现代负载测试工具）

# 安装：https://github.com/wg/wrk
# 10 秒，4 个线程，100 个连接
wrk -t4 -c100 -d10s http://localhost:3000/api/endpoint

# 使用 Lua 脚本进行自定义请求
wrk -t4 -c100 -d10s -s post.lua http://localhost:3000/api/endpoint

-- post.lua
wrk.method = "POST"
wrk.body   = '{"key": "value"}'
wrk.headers["Content-Type"] = "application/json"

-- 自定义请求生成（独立示例：一旦定义 request()，每个请求都按此处指定的 GET 方法和路径发送，上面的 wrk.method 不再生效）
request = function()
  local id = math.random(1, 10000)
  local path = "/api/users/" .. id
  return wrk.format("GET", path)
end

Autocannon（Node.js 负载测试）

npx autocannon -c 100 -d 10 http://localhost:3000/api/endpoint
npx autocannon -c 100 -d 10 -m POST -b '{"key":"value"}' -H 'Content-Type=application/json' http://localhost:3000/api/endpoint

数据库查询性能

EXPLAIN 分析

# PostgreSQL
psql -c "EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT) SELECT * FROM orders WHERE user_id = 123;"

# MySQL
mysql -e "EXPLAIN SELECT * FROM orders WHERE user_id = 123;" mydb

# SQLite
sqlite3 mydb.sqlite "EXPLAIN QUERY PLAN SELECT * FROM orders WHERE user_id = 123;"

慢查询检测

# PostgreSQL: 启用慢查询日志
# 在 postgresql.conf 中：
# log_min_duration_statement = 100  (毫秒)

# MySQL: 慢查询日志
# 在 my.cnf 中：
# slow_query_log = 1
# long_query_time = 0.1

# 查找缺少索引的查询（PostgreSQL）
psql -c "
SELECT schemaname, relname, seq_scan, seq_tup_read,
       idx_scan, idx_tup_fetch,
       seq_tup_read / GREATEST(seq_scan, 1) AS avg_rows_per_scan
FROM pg_stat_user_tables
WHERE seq_scan > 100 AND seq_tup_read / GREATEST(seq_scan, 1) > 1000
ORDER BY seq_tup_read DESC
LIMIT 10;
"

内存泄漏检测模式

Node.js

// 跟踪一段时间内的对象计数
const v8 = require('v8');

/**
 * Take a snapshot of the current process memory counters.
 *
 * Every value is a string holding megabytes with one decimal place.
 * `heapLimitMB` comes from `v8.getHeapStatistics().heap_size_limit` and shows
 * how much headroom remains before V8 runs out of heap; the other fields come
 * from `process.memoryUsage()`.
 *
 * @returns {{heapUsedMB: string, heapTotalMB: string, heapLimitMB: string,
 *   rssMB: string, externalMB: string, arrayBuffersMB: string}}
 */
function checkMemory() {
  const heap = v8.getHeapStatistics();
  const usage = process.memoryUsage();
  // Bytes -> megabytes, one decimal place.
  const toMB = (bytes) => (bytes / 1024 / 1024).toFixed(1);
  return {
    heapUsedMB: toMB(usage.heapUsed),
    heapTotalMB: toMB(usage.heapTotal),
    heapLimitMB: toMB(heap.heap_size_limit),
    rssMB: toMB(usage.rss),
    externalMB: toMB(usage.external),
    arrayBuffersMB: toMB(usage.arrayBuffers),
  };
}

// Sample heap usage every 10 seconds and warn whenever it exceeds the heap size
// recorded at startup by more than 50MB.
let baseline = process.memoryUsage().heapUsed;
setInterval(() => {
  const { heapUsed } = process.memoryUsage();
  const grownMB = (heapUsed - baseline) / 1024 / 1024;
  // Stay quiet until growth passes the 50MB threshold.
  if (!(grownMB > 50)) return;

  console.warn(`Memory grew ${grownMB.toFixed(1)}MB since start`);
  console.warn(checkMemory());
}, 10000);

常见泄漏模式

```
Node.js:
- 未移除的事件监听器（使用了 emitter.on 但没有 emitter.off）
- 闭包在长生命周期作用域中捕获了大对象
- 没有淘汰机制的全局缓存（只增不减的 Map/Set）
- 累积的未解决 Promise

Python:
- 循环引用（对缓存使用 weakref）
- 无限增长的全局列表
```

技能包地址：https://github.com/openclaw/skills/tree/main/skills/gitgoodordietrying/perf-profiler/SKILL.md

64 次点击 ∙ 0 人收藏

登录后收藏

0 条回复

perf-profiler：分析并优化应用程序性能