123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122 |
- import Metal
- import MetalKit
- import Foundation
/// Runs a synthetic compute workload on the default Metal device and reports
/// the achieved throughput for an FP32 kernel and an FP16 kernel.
///
/// The kernels are compiled at init time from the file-level `metalCode`
/// source string and looked up by the names `optimized_fp32` / `optimized_fp16`.
class OptimizedMetalBenchmark {
    private let device: MTLDevice
    private let commandQueue: MTLCommandQueue
    private let pipelineStateFP32: MTLComputePipelineState
    private let pipelineStateFP16: MTLComputePipelineState

    // Tuning parameters.

    /// Number of elements each GPU thread is expected to process; used to size the grid.
    let elementsPerThread = 4
    /// Inner-loop iteration count used by the FLOP estimate.
    /// NOTE(review): presumably mirrors the loop count inside the kernel source — confirm.
    let loopUnrollFactor = 512
    /// Number of elements in the working buffer: 1 << 26 = 67,108,864 (64Mi) elements,
    /// i.e. 256 MiB when sized with `MemoryLayout<Float>.size`.
    let arrayLength = 1 << 26
    /// Number of timed dispatches per precision.
    let iterations = 100
    /// Threads per threadgroup along x.
    let threadgroupWidth = 256

    /// Creates the device, command queue and both compute pipelines.
    ///
    /// Any failure is unrecoverable for a benchmark, so each step traps with a
    /// message that identifies which stage failed.
    init() {
        guard let device = MTLCreateSystemDefaultDevice(),
              let commandQueue = device.makeCommandQueue() else {
            fatalError("Metal初始化失败")
        }

        self.device = device
        self.commandQueue = commandQueue

        // Compile the shader source; surface the compiler diagnostics on failure
        // instead of an opaque `try!` trap.
        let library: MTLLibrary
        do {
            library = try device.makeLibrary(source: metalCode, options: nil)
        } catch {
            fatalError("Metal库编译失败: \(error)")
        }

        guard let fp32Function = library.makeFunction(name: "optimized_fp32") else {
            fatalError("找不到内核函数: optimized_fp32")
        }
        guard let fp16Function = library.makeFunction(name: "optimized_fp16") else {
            fatalError("找不到内核函数: optimized_fp16")
        }

        do {
            pipelineStateFP32 = try device.makeComputePipelineState(function: fp32Function)
            pipelineStateFP16 = try device.makeComputePipelineState(function: fp16Function)
        } catch {
            fatalError("创建计算管线失败: \(error)")
        }

        print("优化版Metal基准测试初始化完成 - 设备: \(device.name)")
    }

    /// Runs the FP32 pass followed by the FP16 pass and prints a summary of both.
    func runOptimizedBenchmark() {
        print("\n运行优化版FP32测试...")
        let fp32Result = runTest(pipeline: pipelineStateFP32, precision: "FP32")

        print("\n运行优化版FP16测试...")
        let fp16Result = runTest(pipeline: pipelineStateFP16, precision: "FP16")

        print("\n最终结果:")
        print("FP32峰值性能: \(String(format: "%.2f", fp32Result)) TFLOPS")
        print("FP16峰值性能: \(String(format: "%.2f", fp16Result)) TFLOPS")
    }

    /// Times `iterations` dispatches of `pipeline` (after 3 warm-up dispatches)
    /// and prints and returns the estimated throughput.
    ///
    /// - Parameters:
    ///   - pipeline: Compute pipeline to benchmark.
    ///   - precision: Label used in the printed report.
    /// - Returns: Estimated throughput in TFLOPS (0 if no time elapsed).
    private func runTest(pipeline: MTLComputePipelineState, precision: String) -> Double {
        // The buffer is sized for Float for both precisions; this is at least as
        // large as a half-precision buffer of the same element count.
        let bufferSize = arrayLength * MemoryLayout<Float>.size
        guard let buffer = device.makeBuffer(length: bufferSize, options: .storageModePrivate) else {
            fatalError("缓冲区创建失败")
        }

        // A threadgroup larger than the pipeline allows makes the dispatch invalid,
        // which would otherwise show up as a nonsensically fast "result".
        precondition(
            threadgroupWidth <= pipeline.maxTotalThreadsPerThreadgroup,
            "线程组宽度 \(threadgroupWidth) 超过管线上限 \(pipeline.maxTotalThreadsPerThreadgroup)"
        )

        let threadsPerGrid = arrayLength / elementsPerThread
        // Round up so a partial trailing threadgroup is still launched.
        let threadgroups = MTLSize(
            width: (threadsPerGrid + threadgroupWidth - 1) / threadgroupWidth,
            height: 1,
            depth: 1
        )
        let threadgroupSize = MTLSize(width: threadgroupWidth, height: 1, depth: 1)

        // Warm-up: untimed dispatches before measurement starts.
        for _ in 0..<3 {
            runKernel(pipeline: pipeline, buffer: buffer,
                      threadgroups: threadgroups,
                      threadgroupSize: threadgroupSize)
        }

        // Timed run. A monotonic clock is used so wall-clock adjustments cannot
        // skew (or negate) the measured interval.
        let start = DispatchTime.now().uptimeNanoseconds
        for _ in 0..<iterations {
            runKernel(pipeline: pipeline, buffer: buffer,
                      threadgroups: threadgroups,
                      threadgroupSize: threadgroupSize)
        }
        let elapsed = Double(DispatchTime.now().uptimeNanoseconds - start) / 1e9

        // FLOP model: arrayLength elements x loopUnrollFactor inner iterations
        // x 8 FLOPs per iteration x timed dispatches.
        // NOTE(review): the 8-FLOPs figure and the per-element accounting must match
        // the kernel in `metalCode`, which is not visible from this class — confirm.
        let totalFLOPs = Double(arrayLength) * Double(loopUnrollFactor) * 8.0 * Double(iterations)
        let tflops = elapsed > 0 ? (totalFLOPs / elapsed) / 1e12 : 0

        print("\(precision) 结果:")
        print("- 总时间: \(String(format: "%.3f", elapsed))秒")
        print("- 理论计算量: \(String(format: "%.1f", totalFLOPs/1e12)) TFLOP")
        print("- 实测性能: \(String(format: "%.2f", tflops)) TFLOPS")

        return tflops
    }

    /// Encodes one dispatch of `pipeline` over `buffer`, submits it, and blocks
    /// until the GPU has finished. Traps if the command buffer reports an error.
    ///
    /// - Parameters:
    ///   - pipeline: Compute pipeline to execute.
    ///   - buffer: Buffer bound at index 0.
    ///   - threadgroups: Number of threadgroups in the grid.
    ///   - threadgroupSize: Threads per threadgroup.
    private func runKernel(pipeline: MTLComputePipelineState,
                           buffer: MTLBuffer,
                           threadgroups: MTLSize,
                           threadgroupSize: MTLSize) {
        guard let commandBuffer = commandQueue.makeCommandBuffer(),
              let encoder = commandBuffer.makeComputeCommandEncoder() else {
            fatalError("创建命令对象失败")
        }

        encoder.setComputePipelineState(pipeline)
        encoder.setBuffer(buffer, offset: 0, index: 0)
        encoder.dispatchThreadgroups(threadgroups,
                                     threadsPerThreadgroup: threadgroupSize)
        encoder.endEncoding()

        commandBuffer.commit()
        commandBuffer.waitUntilCompleted()

        // A failed command buffer completes early; without this check the failure
        // would be silently counted as a very fast kernel run.
        if let error = commandBuffer.error {
            fatalError("内核执行失败: \(error)")
        }
    }
}
// Entry point: build the benchmark harness, then run the FP32 and FP16 passes.
let benchmark: OptimizedMetalBenchmark = .init()
benchmark.runOptimizedBenchmark()
|