3 tháng trước cách đây · 3a17d2461c
--- a/gputest.xcodeproj/xcshareddata/xcschemes/gputest.xcscheme
+++ b/gputest.xcodeproj/xcshareddata/xcschemes/gputest.xcscheme
@@ -0,0 +1,79 @@
 
				+<?xml version="1.0" encoding="UTF-8"?>
			
 
				+<Scheme
			
 
				+   LastUpgradeVersion = "1600"
			
 
				+   version = "1.7">
			
 
				+   <BuildAction
			
 
				+      parallelizeBuildables = "YES"
			
 
				+      buildImplicitDependencies = "YES"
			
 
				+      buildArchitectures = "Automatic">
			
 
				+      <BuildActionEntries>
			
 
				+         <BuildActionEntry
			
 
				+            buildForTesting = "YES"
			
 
				+            buildForRunning = "YES"
			
 
				+            buildForProfiling = "YES"
			
 
				+            buildForArchiving = "YES"
			
 
				+            buildForAnalyzing = "YES">
			
 
				+            <BuildableReference
			
 
				+               BuildableIdentifier = "primary"
			
 
				+               BlueprintIdentifier = "757057102DDA334D00F5DBF8"
			
 
				+               BuildableName = "gputest"
			
 
				+               BlueprintName = "gputest"
			
 
				+               ReferencedContainer = "container:gputest.xcodeproj">
			
 
				+            </BuildableReference>
			
 
				+         </BuildActionEntry>
			
 
				+      </BuildActionEntries>
			
 
				+   </BuildAction>
			
 
				+   <TestAction
			
 
				+      buildConfiguration = "Debug"
			
 
				+      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
			
 
				+      selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
			
 
				+      shouldUseLaunchSchemeArgsEnv = "YES"
			
 
				+      shouldAutocreateTestPlan = "YES">
			
 
				+   </TestAction>
			
 
				+   <LaunchAction
			
 
				+      buildConfiguration = "Release"
			
 
				+      selectedDebuggerIdentifier = ""
			
 
				+      selectedLauncherIdentifier = "Xcode.IDEFoundation.Launcher.PosixSpawn"
			
 
				+      launchStyle = "0"
			
 
				+      useCustomWorkingDirectory = "NO"
			
 
				+      ignoresPersistentStateOnLaunch = "NO"
			
 
				+      debugDocumentVersioning = "YES"
			
 
				+      debugServiceExtension = "internal"
			
 
				+      allowLocationSimulation = "YES"
			
 
				+      viewDebuggingEnabled = "No">
			
 
				+      <BuildableProductRunnable
			
 
				+         runnableDebuggingMode = "0">
			
 
				+         <BuildableReference
			
 
				+            BuildableIdentifier = "primary"
			
 
				+            BlueprintIdentifier = "757057102DDA334D00F5DBF8"
			
 
				+            BuildableName = "gputest"
			
 
				+            BlueprintName = "gputest"
			
 
				+            ReferencedContainer = "container:gputest.xcodeproj">
			
 
				+         </BuildableReference>
			
 
				+      </BuildableProductRunnable>
			
 
				+   </LaunchAction>
			
 
				+   <ProfileAction
			
 
				+      buildConfiguration = "Release"
			
 
				+      shouldUseLaunchSchemeArgsEnv = "YES"
			
 
				+      savedToolIdentifier = ""
			
 
				+      useCustomWorkingDirectory = "NO"
			
 
				+      debugDocumentVersioning = "YES">
			
 
				+      <BuildableProductRunnable
			
 
				+         runnableDebuggingMode = "0">
			
 
				+         <BuildableReference
			
 
				+            BuildableIdentifier = "primary"
			
 
				+            BlueprintIdentifier = "757057102DDA334D00F5DBF8"
			
 
				+            BuildableName = "gputest"
			
 
				+            BlueprintName = "gputest"
			
 
				+            ReferencedContainer = "container:gputest.xcodeproj">
			
 
				+         </BuildableReference>
			
 
				+      </BuildableProductRunnable>
			
 
				+   </ProfileAction>
			
 
				+   <AnalyzeAction
			
 
				+      buildConfiguration = "Debug">
			
 
				+   </AnalyzeAction>
			
 
				+   <ArchiveAction
			
 
				+      buildConfiguration = "Release"
			
 
				+      revealArchiveInOrganizer = "YES">
			
 
				+   </ArchiveAction>
			
 
				+</Scheme>
			
--- a/gputest.xcodeproj/xcuserdata/xcbosa.xcuserdatad/xcschemes/xcschememanagement.plist
+++ b/gputest.xcodeproj/xcuserdata/xcbosa.xcuserdatad/xcschemes/xcschememanagement.plist
@@ -10,5 +10,13 @@
 
				 			<integer>0</integer>
			
 
				 		</dict>
			
 
				 	</dict>
			
 
				+	<key>SuppressBuildableAutocreation</key>
			
 
				+	<dict>
			
 
				+		<key>757057102DDA334D00F5DBF8</key>
			
 
				+		<dict>
			
 
				+			<key>primary</key>
			
 
				+			<true/>
			
 
				+		</dict>
			
 
				+	</dict>
			
 
				 </dict>
			
 
				 </plist>
			
--- a/gputest/MetalShader.metal
+++ b/gputest/MetalShader.metal
@@ -0,0 +1,37 @@
 
				+#include <metal_stdlib>
			
 
				+using namespace metal;
			
 
				+
			
 
				+/// FP32 arithmetic-throughput kernel.
				+///
				+/// Each thread loads the float4 stored at index `tid`, pushes it through a
				+/// dependent chain of fused multiply-adds, and writes the result back to the
				+/// same slot. There is no bounds check on `tid`, so the dispatched grid must
				+/// not be larger than the number of float4 elements in `buffer`.
				+kernel void optimized_fp32(
				+    device float4 *buffer [[buffer(0)]],
				+    uint tid [[thread_position_in_grid]]
				+) {
				+    // Multiplier / addend pairs for the four FMAs of one round.
				+    const float4 mulA = float4(1.01), addA = float4(0.97);
				+    const float4 mulB = float4(0.99), addB = float4(1.02);
				+    const float4 mulC = float4(1.03), addC = float4(0.98);
				+    const float4 mulD = float4(0.96), addD = float4(1.05);
				+
				+    // Every thread works on four packed floats.
				+    float4 acc = buffer[tid];
				+
				+    // Long loop to raise arithmetic intensity. One round is 4 FMAs, i.e.
				+    // 8 floating-point operations per lane, or 32 FLOPs per thread.
				+    for (int remaining = 512; remaining > 0; --remaining) {
				+        acc = fma(acc, mulA, addA);
				+        acc = fma(acc, mulB, addB);
				+        acc = fma(acc, mulC, addC);
				+        acc = fma(acc, mulD, addD);
				+    }
				+
				+    buffer[tid] = acc;
				+}
			
 
				+
			
 
				+/// FP16 arithmetic-throughput kernel.
				+///
				+/// Half-precision counterpart of the FP32 kernel: each thread loads the half4
				+/// stored at index `tid`, runs 512 rounds of four dependent fused
				+/// multiply-adds on it, and stores the result back in place. There is no
				+/// bounds check on `tid`, so the dispatched grid must not be larger than the
				+/// number of half4 elements in `buffer`.
				+kernel void optimized_fp16(
				+    device half4 *buffer [[buffer(0)]],
				+    uint tid [[thread_position_in_grid]]
				+) {
				+    // Multiplier / addend pairs for the four FMAs of one round.
				+    const half4 mulA = half4(1.01), addA = half4(0.97);
				+    const half4 mulB = half4(0.99), addB = half4(1.02);
				+    const half4 mulC = half4(1.03), addC = half4(0.98);
				+    const half4 mulD = half4(0.96), addD = half4(1.05);
				+
				+    half4 acc = buffer[tid];
				+
				+    for (int remaining = 512; remaining > 0; --remaining) {
				+        acc = fma(acc, mulA, addA);
				+        acc = fma(acc, mulB, addB);
				+        acc = fma(acc, mulC, addC);
				+        acc = fma(acc, mulD, addD);
				+    }
				+
				+    buffer[tid] = acc;
				+}
			
--- a/gputest/main.swift
+++ b/gputest/main.swift
@@ -1,11 +1,122 @@
 
				-//
			
 
				-//  main.swift
			
 
				-//  gputest
			
 
				-//
			
 
				-//  Created by 邢铖 on 2025/5/18.
			
 
				-//
			
 
				-
			
 
				+import Metal
			
 
				+import MetalKit
			
 
				 import Foundation
			
 
				 
			
 
				-print("Hello, World!")
			
 
				+/// Measures FP32 and FP16 FMA throughput on the system default Metal device.
				+///
				+/// The class builds compute pipelines for the `optimized_fp32` and
				+/// `optimized_fp16` kernels of the default library, dispatches each kernel
				+/// repeatedly over one GPU-private buffer, and converts the elapsed host time
				+/// into TFLOPS. Any setup or execution failure terminates the process via
				+/// `fatalError`.
				+class OptimizedMetalBenchmark {
				+    private let device: MTLDevice
				+    private let commandQueue: MTLCommandQueue
				+    private let pipelineStateFP32: MTLComputePipelineState
				+    private let pipelineStateFP16: MTLComputePipelineState
				+
				+    // Tuning parameters
				+    /// Scalar elements handled by one GPU thread (the kernels use 4-wide vectors).
				+    let elementsPerThread = 4
				+    /// Rounds of the FMA loop run by each thread. Used only for the FLOP
				+    /// estimate; it has to match the loop bound hard-coded in the kernels (512).
				+    let loopUnrollFactor = 512
				+    /// Scalar elements covered by one dispatch (4M, chosen to limit memory pressure).
				+    let arrayLength = 1 << 22
				+    /// Number of timed dispatches per precision.
				+    let iterations = 100
				+    /// Threads per threadgroup.
				+    let threadgroupWidth = 256
				+
				+    /// Acquires the default device, a command queue, and both compute pipelines.
				+    init() {
				+        guard let device = MTLCreateSystemDefaultDevice(),
				+              let commandQueue = device.makeCommandQueue() else {
				+            fatalError("Metal初始化失败")
				+        }
				+
				+        self.device = device
				+        self.commandQueue = commandQueue
				+
				+        // Report a missing library or kernel explicitly instead of trapping on
				+        // a force-unwrap with no context.
				+        guard let library = device.makeDefaultLibrary() else {
				+            fatalError("加载默认Metal库失败")
				+        }
				+        guard let fp32Function = library.makeFunction(name: "optimized_fp32"),
				+              let fp16Function = library.makeFunction(name: "optimized_fp16") else {
				+            fatalError("未找到计算内核函数")
				+        }
				+
				+        do {
				+            pipelineStateFP32 = try device.makeComputePipelineState(function: fp32Function)
				+            pipelineStateFP16 = try device.makeComputePipelineState(function: fp16Function)
				+        } catch {
				+            fatalError("创建计算管线失败: \(error)")
				+        }
				+
				+        print("优化版Metal基准测试初始化完成 - 设备: \(device.name)")
				+    }
				+
				+    /// Runs the FP32 pass followed by the FP16 pass and prints a summary of both.
				+    func runOptimizedBenchmark() {
				+        print("\n运行优化版FP32测试...")
				+        let fp32Result = runTest(pipeline: pipelineStateFP32, precision: "FP32")
				+
				+        print("\n运行优化版FP16测试...")
				+        let fp16Result = runTest(pipeline: pipelineStateFP16, precision: "FP16")
				+
				+        print("\n最终结果:")
				+        print("FP32峰值性能: \(String(format: "%.2f", fp32Result)) TFLOPS")
				+        print("FP16峰值性能: \(String(format: "%.2f", fp16Result)) TFLOPS")
				+    }
				+
				+    /// Benchmarks one pipeline and prints its timing.
				+    ///
				+    /// - Parameters:
				+    ///   - pipeline: Compute pipeline to dispatch.
				+    ///   - precision: Label used in the printed report (e.g. "FP32").
				+    /// - Returns: Measured throughput in TFLOPS. The timed interval is host
				+    ///   time around commit + wait, so it includes submission overhead.
				+    private func runTest(pipeline: MTLComputePipelineState, precision: String) -> Double {
				+        let threadsPerGrid = arrayLength / elementsPerThread
				+
				+        // The kernels index the buffer by thread id without a bounds check, so
				+        // the grid has to cover the data exactly and the threadgroup has to be
				+        // a size this pipeline can actually run.
				+        guard threadgroupWidth <= pipeline.maxTotalThreadsPerThreadgroup else {
				+            fatalError("线程组宽度超出管线支持的上限")
				+        }
				+        guard arrayLength % elementsPerThread == 0,
				+              threadsPerGrid % threadgroupWidth == 0 else {
				+            fatalError("线程总数必须是线程组宽度的整数倍")
				+        }
				+
				+        // Sized for the FP32 case; the FP16 kernel touches only the first half.
				+        let bufferSize = arrayLength * MemoryLayout<Float>.size
				+        guard let buffer = device.makeBuffer(length: bufferSize, options: .storageModePrivate) else {
				+            fatalError("缓冲区创建失败")
				+        }
				+
				+        // Exact division is safe here because divisibility was checked above.
				+        let threadgroups = MTLSize(
				+            width: threadsPerGrid / threadgroupWidth,
				+            height: 1,
				+            depth: 1
				+        )
				+        let threadgroupSize = MTLSize(width: threadgroupWidth, height: 1, depth: 1)
				+
				+        // Warm-up
				+        for _ in 0..<3 {
				+            runKernel(pipeline: pipeline, buffer: buffer,
				+                      threadgroups: threadgroups,
				+                      threadgroupSize: threadgroupSize)
				+        }
				+
				+        // Timed run. A monotonic clock is used because CFAbsoluteTimeGetCurrent
				+        // follows the wall clock and can jump when the system time is adjusted.
				+        let start = DispatchTime.now().uptimeNanoseconds
				+        for _ in 0..<iterations {
				+            runKernel(pipeline: pipeline, buffer: buffer,
				+                      threadgroups: threadgroups,
				+                      threadgroupSize: threadgroupSize)
				+        }
				+        let elapsed = Double(DispatchTime.now().uptimeNanoseconds - start) / 1e9
				+
				+        // FLOP count: every element goes through `loopUnrollFactor` (512) rounds,
				+        // and each round is 4 FMAs = 8 FLOPs per element.
				+        let totalFLOPs = Double(arrayLength) * Double(loopUnrollFactor) * 8.0 * Double(iterations)
				+        let tflops = (totalFLOPs / elapsed) / 1e12
				+
				+        print("\(precision) 结果:")
				+        print("- 总时间: \(String(format: "%.3f", elapsed))秒")
				+        print("- 理论计算量: \(String(format: "%.1f", totalFLOPs/1e12)) TFLOP")
				+        print("- 实测性能: \(String(format: "%.2f", tflops)) TFLOPS")
				+
				+        return tflops
				+    }
				+
				+    /// Encodes one dispatch of `pipeline` over `buffer`, submits it, and blocks
				+    /// until the GPU has finished. Terminates the process if execution failed,
				+    /// so a failed dispatch is never counted as completed work.
				+    private func runKernel(pipeline: MTLComputePipelineState,
				+                           buffer: MTLBuffer,
				+                           threadgroups: MTLSize,
				+                           threadgroupSize: MTLSize) {
				+        guard let commandBuffer = commandQueue.makeCommandBuffer(),
				+              let encoder = commandBuffer.makeComputeCommandEncoder() else {
				+            fatalError("创建命令对象失败")
				+        }
				+
				+        encoder.setComputePipelineState(pipeline)
				+        encoder.setBuffer(buffer, offset: 0, index: 0)
				+
				+        encoder.dispatchThreadgroups(threadgroups,
				+                                     threadsPerThreadgroup: threadgroupSize)
				+        encoder.endEncoding()
				+
				+        commandBuffer.commit()
				+        commandBuffer.waitUntilCompleted()
				+
				+        if let error = commandBuffer.error {
				+            fatalError("GPU命令执行失败: \(error)")
				+        }
				+    }
				+}
			
 
				 
			
 
				+// Entry point: construct the benchmark and run it.
				+OptimizedMetalBenchmark().runOptimizedBenchmark()