xcbosa mbp16 3 tháng trước cách đây
mục cha
commit
3a17d2461c

+ 79 - 0
gputest.xcodeproj/xcshareddata/xcschemes/gputest.xcscheme

@@ -0,0 +1,79 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Scheme
+   LastUpgradeVersion = "1600"
+   version = "1.7">
+   <BuildAction
+      parallelizeBuildables = "YES"
+      buildImplicitDependencies = "YES"
+      buildArchitectures = "Automatic">
+      <BuildActionEntries>
+         <BuildActionEntry
+            buildForTesting = "YES"
+            buildForRunning = "YES"
+            buildForProfiling = "YES"
+            buildForArchiving = "YES"
+            buildForAnalyzing = "YES">
+            <BuildableReference
+               BuildableIdentifier = "primary"
+               BlueprintIdentifier = "757057102DDA334D00F5DBF8"
+               BuildableName = "gputest"
+               BlueprintName = "gputest"
+               ReferencedContainer = "container:gputest.xcodeproj">
+            </BuildableReference>
+         </BuildActionEntry>
+      </BuildActionEntries>
+   </BuildAction>
+   <TestAction
+      buildConfiguration = "Debug"
+      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
+      selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
+      shouldUseLaunchSchemeArgsEnv = "YES"
+      shouldAutocreateTestPlan = "YES">
+   </TestAction>
+   <LaunchAction
+      buildConfiguration = "Release"
+      selectedDebuggerIdentifier = ""
+      selectedLauncherIdentifier = "Xcode.IDEFoundation.Launcher.PosixSpawn"
+      launchStyle = "0"
+      useCustomWorkingDirectory = "NO"
+      ignoresPersistentStateOnLaunch = "NO"
+      debugDocumentVersioning = "YES"
+      debugServiceExtension = "internal"
+      allowLocationSimulation = "YES"
+      viewDebuggingEnabled = "No">
+      <BuildableProductRunnable
+         runnableDebuggingMode = "0">
+         <BuildableReference
+            BuildableIdentifier = "primary"
+            BlueprintIdentifier = "757057102DDA334D00F5DBF8"
+            BuildableName = "gputest"
+            BlueprintName = "gputest"
+            ReferencedContainer = "container:gputest.xcodeproj">
+         </BuildableReference>
+      </BuildableProductRunnable>
+   </LaunchAction>
+   <ProfileAction
+      buildConfiguration = "Release"
+      shouldUseLaunchSchemeArgsEnv = "YES"
+      savedToolIdentifier = ""
+      useCustomWorkingDirectory = "NO"
+      debugDocumentVersioning = "YES">
+      <BuildableProductRunnable
+         runnableDebuggingMode = "0">
+         <BuildableReference
+            BuildableIdentifier = "primary"
+            BlueprintIdentifier = "757057102DDA334D00F5DBF8"
+            BuildableName = "gputest"
+            BlueprintName = "gputest"
+            ReferencedContainer = "container:gputest.xcodeproj">
+         </BuildableReference>
+      </BuildableProductRunnable>
+   </ProfileAction>
+   <AnalyzeAction
+      buildConfiguration = "Debug">
+   </AnalyzeAction>
+   <ArchiveAction
+      buildConfiguration = "Release"
+      revealArchiveInOrganizer = "YES">
+   </ArchiveAction>
+</Scheme>

+ 8 - 0
gputest.xcodeproj/xcuserdata/xcbosa.xcuserdatad/xcschemes/xcschememanagement.plist

@@ -10,5 +10,13 @@
 			<integer>0</integer>
 		</dict>
 	</dict>
+	<key>SuppressBuildableAutocreation</key>
+	<dict>
+		<key>757057102DDA334D00F5DBF8</key>
+		<dict>
+			<key>primary</key>
+			<true/>
+		</dict>
+	</dict>
 </dict>
 </plist>

+ 37 - 0
gputest/MetalShader.metal

@@ -0,0 +1,37 @@
+#include <metal_stdlib>
+using namespace metal;
+
+/// FP32 arithmetic-throughput kernel.
+///
+/// Each thread loads one `float4` from `buffer` at its grid position, runs it
+/// through a long chain of dependent fused multiply-adds, and stores the
+/// result back to the same slot.
+kernel void optimized_fp32(
+    device float4 *buffer [[buffer(0)]],
+    uint tid [[thread_position_in_grid]]
+) {
+    // One float4 (four float lanes) per thread.
+    float4 acc = buffer[tid];
+
+    // 512 rounds; each round is 4 FMAs x 2 FLOPs x 4 lanes = 32 FLOPs per thread.
+    int remaining = 512;
+    while (remaining-- > 0) {
+        acc = fma(acc, float4(1.01), float4(0.97));
+        acc = fma(acc, float4(0.99), float4(1.02));
+        acc = fma(acc, float4(1.03), float4(0.98));
+        acc = fma(acc, float4(0.96), float4(1.05));
+    }
+
+    // Write the result back to the slot it was read from.
+    buffer[tid] = acc;
+}
+
+/// FP16 arithmetic-throughput kernel.
+///
+/// Half-precision counterpart of `optimized_fp32`: each thread loads one
+/// `half4`, applies the same chain of dependent fused multiply-adds, and
+/// stores the result back to the same slot.
+kernel void optimized_fp16(
+    device half4 *buffer [[buffer(0)]],
+    uint tid [[thread_position_in_grid]]
+) {
+    // One half4 (four half lanes) per thread.
+    half4 acc = buffer[tid];
+
+    // 512 rounds; each round is 4 FMAs x 2 FLOPs x 4 lanes = 32 FLOPs per thread.
+    int remaining = 512;
+    while (remaining-- > 0) {
+        acc = fma(acc, half4(1.01), half4(0.97));
+        acc = fma(acc, half4(0.99), half4(1.02));
+        acc = fma(acc, half4(1.03), half4(0.98));
+        acc = fma(acc, half4(0.96), half4(1.05));
+    }
+
+    // Write the result back to the slot it was read from.
+    buffer[tid] = acc;
+}

+ 119 - 8
gputest/main.swift

@@ -1,11 +1,122 @@
-//
-//  main.swift
-//  gputest
-//
-//  Created by 邢铖 on 2025/5/18.
-//
-
+import Metal
+import MetalKit
 import Foundation
 
-print("Hello, World!")
+/// Measures FP32 and FP16 fused-multiply-add throughput on the default Metal device.
+///
+/// The benchmark repeatedly dispatches the `optimized_fp32` and `optimized_fp16`
+/// compute kernels over a single GPU buffer and converts the elapsed time into
+/// TFLOPS.
+class OptimizedMetalBenchmark {
+    private let device: MTLDevice
+    private let commandQueue: MTLCommandQueue
+    private let pipelineStateFP32: MTLComputePipelineState
+    private let pipelineStateFP16: MTLComputePipelineState
+
+    // Tuning parameters.
+
+    /// Scalar elements handled by each GPU thread (the kernels work on 4-wide vectors).
+    let elementsPerThread = 4
+    /// Loop trip count inside each kernel; must stay equal to the literal `512`
+    /// used by the kernels in MetalShader.metal or the FLOP count below is wrong.
+    let loopUnrollFactor = 512
+    /// Scalar elements per dispatch: 4M, chosen to reduce memory pressure.
+    let arrayLength = 1 << 22
+    /// Number of timed dispatches per precision.
+    let iterations = 100
+    /// Threads per threadgroup.
+    let threadgroupWidth = 256
+
+    /// Creates the device, command queue and both compute pipelines.
+    ///
+    /// Terminates the process with a descriptive message if any Metal object
+    /// cannot be created; there is nothing to benchmark without them.
+    init() {
+        guard let device = MTLCreateSystemDefaultDevice(),
+              let commandQueue = device.makeCommandQueue() else {
+            fatalError("Metal初始化失败")
+        }
+        // Fail with a message instead of a bare force-unwrap trap.
+        guard let library = device.makeDefaultLibrary() else {
+            fatalError("无法加载默认Metal库")
+        }
+        guard let fp32Function = library.makeFunction(name: "optimized_fp32"),
+              let fp16Function = library.makeFunction(name: "optimized_fp16") else {
+            fatalError("在Metal库中找不到计算内核函数")
+        }
+
+        self.device = device
+        self.commandQueue = commandQueue
+
+        do {
+            pipelineStateFP32 = try device.makeComputePipelineState(function: fp32Function)
+            pipelineStateFP16 = try device.makeComputePipelineState(function: fp16Function)
+        } catch {
+            fatalError("创建计算管线失败: \(error)")
+        }
+
+        print("优化版Metal基准测试初始化完成 - 设备: \(device.name)")
+    }
+
+    /// Runs the FP32 test, then the FP16 test, and prints a summary of both.
+    func runOptimizedBenchmark() {
+        print("\n运行优化版FP32测试...")
+        let fp32Result = runTest(pipeline: pipelineStateFP32, precision: "FP32")
+
+        print("\n运行优化版FP16测试...")
+        let fp16Result = runTest(pipeline: pipelineStateFP16, precision: "FP16")
+
+        print("\n最终结果:")
+        print("FP32峰值性能: \(String(format: "%.2f", fp32Result)) TFLOPS")
+        print("FP16峰值性能: \(String(format: "%.2f", fp16Result)) TFLOPS")
+    }
+
+    /// Times `iterations` dispatches of `pipeline` and returns the measured TFLOPS.
+    ///
+    /// - Parameters:
+    ///   - pipeline: Compute pipeline to benchmark.
+    ///   - precision: Label used in the printed report (e.g. "FP32").
+    /// - Returns: Measured throughput in TFLOPS.
+    private func runTest(pipeline: MTLComputePipelineState, precision: String) -> Double {
+        // The kernels have no bounds check, so the grid must cover the buffer
+        // exactly: any rounding in the divisions below would make the last
+        // threadgroup read and write past the end of the buffer.
+        precondition(arrayLength % elementsPerThread == 0,
+                     "arrayLength必须是elementsPerThread的整数倍")
+        let threadsPerGrid = arrayLength / elementsPerThread
+        precondition(threadgroupWidth <= pipeline.maxTotalThreadsPerThreadgroup,
+                     "threadgroupWidth超过了管线允许的最大线程组大小")
+        precondition(threadsPerGrid % threadgroupWidth == 0,
+                     "线程总数必须是threadgroupWidth的整数倍")
+
+        // Sized for the FP32 case (4 bytes per element). The FP16 kernel reads
+        // half4 per thread and therefore only touches the first half.
+        let bufferSize = arrayLength * MemoryLayout<Float>.size
+        guard let buffer = device.makeBuffer(length: bufferSize, options: .storageModePrivate) else {
+            fatalError("缓冲区创建失败")
+        }
+
+        let threadgroups = MTLSize(
+            width: (threadsPerGrid + threadgroupWidth - 1) / threadgroupWidth,
+            height: 1,
+            depth: 1
+        )
+        let threadgroupSize = MTLSize(width: threadgroupWidth, height: 1, depth: 1)
+
+        // Warm-up dispatches, excluded from the timing.
+        for _ in 0..<3 {
+            runKernel(pipeline: pipeline, buffer: buffer,
+                      threadgroups: threadgroups,
+                      threadgroupSize: threadgroupSize)
+        }
+
+        // Timed run. A monotonic clock is used so a wall-clock adjustment during
+        // the run cannot distort the result.
+        // NOTE(review): each dispatch is committed and waited on individually, so
+        // the measured time also includes per-dispatch submit/wait overhead.
+        let start = DispatchTime.now().uptimeNanoseconds
+        for _ in 0..<iterations {
+            runKernel(pipeline: pipeline, buffer: buffer,
+                      threadgroups: threadgroups,
+                      threadgroupSize: threadgroupSize)
+        }
+        let elapsed = Double(DispatchTime.now().uptimeNanoseconds - start) / 1e9
+
+        // FLOP count: every element goes through `loopUnrollFactor` (512) loop
+        // rounds, and each round applies 4 FMAs = 8 FLOPs to that element.
+        let totalFLOPs = Double(arrayLength) * Double(loopUnrollFactor) * 8.0 * Double(iterations)
+        let tflops = (totalFLOPs / elapsed) / 1e12
+
+        print("\(precision) 结果:")
+        print("- 总时间: \(String(format: "%.3f", elapsed))秒")
+        print("- 理论计算量: \(String(format: "%.1f", totalFLOPs/1e12)) TFLOP")
+        print("- 实测性能: \(String(format: "%.2f", tflops)) TFLOPS")
+
+        return tflops
+    }
+
+    /// Encodes one dispatch of `pipeline` over `buffer`, submits it and blocks
+    /// until the GPU has finished.
+    ///
+    /// Terminates the process if the command buffer reports an error, because a
+    /// failed dispatch would otherwise be counted as completed work.
+    private func runKernel(pipeline: MTLComputePipelineState,
+                           buffer: MTLBuffer,
+                           threadgroups: MTLSize,
+                           threadgroupSize: MTLSize) {
+        guard let commandBuffer = commandQueue.makeCommandBuffer(),
+              let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            fatalError("创建命令对象失败")
+        }
+
+        encoder.setComputePipelineState(pipeline)
+        encoder.setBuffer(buffer, offset: 0, index: 0)
+        encoder.dispatchThreadgroups(threadgroups,
+                                     threadsPerThreadgroup: threadgroupSize)
+        encoder.endEncoding()
+
+        commandBuffer.commit()
+        commandBuffer.waitUntilCompleted()
+
+        if let error = commandBuffer.error {
+            fatalError("GPU命令执行失败: \(error)")
+        }
+    }
+}
 
+// Run the optimized benchmark.
+let benchmark = OptimizedMetalBenchmark()
+benchmark.runOptimizedBenchmark()