// xcbosa/AppleGPUTest
							import Metal
import MetalKit
import Foundation

/// Compute-throughput benchmark that times two Metal kernels, `optimized_fp32`
/// and `optimized_fp16`, and reports the achieved TFLOPS for each.
///
/// The kernels are compiled at init time from `metalCode`, which is defined
/// outside this class.
class OptimizedMetalBenchmark {
    private let device: MTLDevice
    private let commandQueue: MTLCommandQueue
    private let pipelineStateFP32: MTLComputePipelineState
    private let pipelineStateFP16: MTLComputePipelineState

    // MARK: - Tuning parameters

    /// Number of buffer elements assigned to each GPU thread when sizing the grid.
    let elementsPerThread = 4
    /// Per-element loop count used in the FLOP estimate.
    /// NOTE(review): presumably has to match the loop bound in the kernel source — confirm.
    let loopUnrollFactor = 512
    /// Element count of the working buffer: 2^26 = 64Mi elements
    /// (256 MiB when sized as `Float`).
    let arrayLength = 1 << 26
    /// Number of timed kernel dispatches per precision.
    let iterations = 100
    /// Threads per threadgroup along x.
    let threadgroupWidth = 256

    // MARK: - Setup

    /// Creates the default Metal device and a command queue, compiles the shader
    /// library and builds one compute pipeline per precision.
    ///
    /// Every failure is fatal and reported with a descriptive message.
    init() {
        guard let device = MTLCreateSystemDefaultDevice(),
              let commandQueue = device.makeCommandQueue() else {
            fatalError("Metal初始化失败")
        }

        // Compile the shader source explicitly so a compile error is reported
        // with its diagnostics instead of a bare `try!` trap.
        let library: MTLLibrary
        do {
            library = try device.makeLibrary(source: metalCode, options: nil)
        } catch {
            fatalError("着色器库编译失败: \(error)")
        }

        self.device = device
        self.commandQueue = commandQueue
        pipelineStateFP32 = OptimizedMetalBenchmark.makePipeline(
            named: "optimized_fp32", in: library, on: device)
        pipelineStateFP16 = OptimizedMetalBenchmark.makePipeline(
            named: "optimized_fp16", in: library, on: device)

        print("优化版Metal基准测试初始化完成 - 设备: \(device.name)")
    }

    /// Looks up the kernel `name` in `library` and builds a compute pipeline for it.
    ///
    /// Static so that it can be called before all stored properties are set.
    /// Terminates the process if the kernel is missing or the pipeline cannot be built.
    private static func makePipeline(named name: String,
                                     in library: MTLLibrary,
                                     on device: MTLDevice) -> MTLComputePipelineState {
        guard let function = library.makeFunction(name: name) else {
            fatalError("找不到内核函数: \(name)")
        }
        do {
            return try device.makeComputePipelineState(function: function)
        } catch {
            fatalError("创建计算管线失败: \(error)")
        }
    }

    // MARK: - Benchmark

    /// Runs the FP32 pass, then the FP16 pass, and prints a summary of both results.
    func runOptimizedBenchmark() {
        print("\n运行优化版FP32测试...")
        let fp32Result = runTest(pipeline: pipelineStateFP32, precision: "FP32")

        print("\n运行优化版FP16测试...")
        let fp16Result = runTest(pipeline: pipelineStateFP16, precision: "FP16")

        print("\n最终结果:")
        print("FP32峰值性能: \(String(format: "%.2f", fp32Result)) TFLOPS")
        print("FP16峰值性能: \(String(format: "%.2f", fp16Result)) TFLOPS")
    }

    /// Times `iterations` dispatches of `pipeline` (after 3 untimed warm-up
    /// dispatches) and prints and returns the resulting throughput.
    ///
    /// - Parameters:
    ///   - pipeline: Compute pipeline to benchmark.
    ///   - precision: Label used in the printed report.
    /// - Returns: Measured throughput in TFLOPS.
    private func runTest(pipeline: MTLComputePipelineState, precision: String) -> Double {
        // Fail with a clear message rather than submitting a dispatch the
        // pipeline cannot execute.
        precondition(threadgroupWidth <= pipeline.maxTotalThreadsPerThreadgroup,
                     "线程组大小 \(threadgroupWidth) 超过管线上限 \(pipeline.maxTotalThreadsPerThreadgroup)")

        // The buffer is sized with `Float` for both precisions.
        let bufferSize = arrayLength * MemoryLayout<Float>.size
        guard let buffer = device.makeBuffer(length: bufferSize, options: .storageModePrivate) else {
            fatalError("缓冲区创建失败")
        }

        // One thread per `elementsPerThread` elements; round the threadgroup
        // count up so the whole grid is covered.
        let threadsPerGrid = arrayLength / elementsPerThread
        let threadgroups = MTLSize(
            width: (threadsPerGrid + threadgroupWidth - 1) / threadgroupWidth,
            height: 1,
            depth: 1
        )
        let threadgroupSize = MTLSize(width: threadgroupWidth, height: 1, depth: 1)

        // Warm-up dispatches, excluded from the timing.
        for _ in 0..<3 {
            runKernel(pipeline: pipeline, buffer: buffer,
                      threadgroups: threadgroups,
                      threadgroupSize: threadgroupSize)
        }

        // Timed run. `systemUptime` is monotonic, so the measurement cannot be
        // skewed by wall-clock adjustments. Each dispatch is committed and
        // waited on individually, so submission overhead is part of the result.
        let start = ProcessInfo.processInfo.systemUptime
        for _ in 0..<iterations {
            runKernel(pipeline: pipeline, buffer: buffer,
                      threadgroups: threadgroups,
                      threadgroupSize: threadgroupSize)
        }
        let elapsed = ProcessInfo.processInfo.systemUptime - start

        // FLOP estimate: elements x loop count x 8 FLOPs x dispatches.
        // NOTE(review): assumes the kernel performs 8 FLOPs per element per loop
        // iteration; the kernel source is not part of this class — confirm.
        let totalFLOPs = Double(arrayLength) * Double(loopUnrollFactor) * 8.0 * Double(iterations)
        let tflops = (totalFLOPs / elapsed) / 1e12

        print("\(precision) 结果:")
        print("- 总时间: \(String(format: "%.3f", elapsed))秒")
        print("- 理论计算量: \(String(format: "%.1f", totalFLOPs/1e12)) TFLOP")
        print("- 实测性能: \(String(format: "%.2f", tflops)) TFLOPS")

        return tflops
    }

    /// Encodes one dispatch of `pipeline` over `buffer`, submits it and blocks
    /// until the GPU has finished.
    ///
    /// Terminates the process if the command buffer completes with an error, so
    /// a failed dispatch is never counted as a valid measurement.
    private func runKernel(pipeline: MTLComputePipelineState,
                           buffer: MTLBuffer,
                           threadgroups: MTLSize,
                           threadgroupSize: MTLSize) {
        guard let commandBuffer = commandQueue.makeCommandBuffer(),
              let encoder = commandBuffer.makeComputeCommandEncoder() else {
            fatalError("创建命令对象失败")
        }

        encoder.setComputePipelineState(pipeline)
        encoder.setBuffer(buffer, offset: 0, index: 0)
        encoder.dispatchThreadgroups(threadgroups,
                                     threadsPerThreadgroup: threadgroupSize)
        encoder.endEncoding()

        commandBuffer.commit()
        commandBuffer.waitUntilCompleted()

        if let error = commandBuffer.error {
            fatalError("GPU执行失败: \(error)")
        }
    }
}

// Entry point: construct the benchmark, then run the FP32 and FP16 passes.
let benchmark: OptimizedMetalBenchmark = .init()
benchmark.runOptimizedBenchmark()