//
//  MetalSource.swift
//  gputest
//
//  Created by xingcheng on 2025/5/18.
//

/// Metal Shading Language source defining two compute kernels,
/// `optimized_fp32` and `optimized_fp16`.
///
/// Both kernels have the same shape: each thread loads one 4-wide vector
/// (`float4` or `half4`) from `buffer[tid]`, runs 512 loop iterations of four
/// dependent `fma` calls on it, and stores the result back to the same slot.
/// The comments inside the shader (kept verbatim because they are part of the
/// string) note that each thread handles four scalar elements and count the
/// loop body as 32 FLOPs per iteration.
///
/// Neither kernel bounds-checks `tid`, so the dispatched grid must not contain
/// more threads than the buffer has vector elements.
///
/// NOTE(review): presumably compiled at runtime (e.g. via
/// `MTLDevice.makeLibrary(source:options:)`) — confirm against the caller.
let metalCode = """
#include <metal_stdlib>
using namespace metal;

kernel void optimized_fp32(
    device float4 *buffer [[buffer(0)]],
    uint tid [[thread_position_in_grid]]
) {
    // 每个线程处理4个float元素
    float4 x = buffer[tid];

    // 展开循环增加计算强度
    for (int i = 0; i < 512; ++i) {
        // 每个循环8次浮点运算 x4 => 32 FLOPs/循环
        x = fma(x, float4(1.01), float4(0.97));
        x = fma(x, float4(0.99), float4(1.02));
        x = fma(x, float4(1.03), float4(0.98));
        x = fma(x, float4(0.96), float4(1.05));
    }

    buffer[tid] = x;
}

kernel void optimized_fp16(
    device half4 *buffer [[buffer(0)]],
    uint tid [[thread_position_in_grid]]
) {
    half4 x = buffer[tid];

    for (int i = 0; i < 512; ++i) {
        x = fma(x, half4(1.01), half4(0.97));
        x = fma(x, half4(0.99), half4(1.02));
        x = fma(x, half4(1.03), half4(0.98));
        x = fma(x, half4(0.96), half4(1.05));
    }

    buffer[tid] = x;
}
"""