12345678910111213141516171819202122232425262728293031323334353637383940414243444546 |
- //
- // MetalSource.swift
- // gputest
- //
- // Created by xingcheng on 2025/5/18.
- //
/// Metal Shading Language source, kept as a Swift string, defining two compute
/// kernels: `optimized_fp32` (operating on `float4`) and `optimized_fp16`
/// (operating on `half4`).
///
/// Both kernels have the same shape:
/// - Each thread reads one 4-wide vector from `buffer[tid]`, where `buffer` is
///   bound at buffer index 0 in the `device` address space and `tid` is the
///   thread's `thread_position_in_grid`.
/// - It then runs 512 loop iterations, each a chain of four dependent `fma`
///   calls with constant multipliers and addends.
/// - Finally it writes the result back to the same `buffer[tid]` slot.
///
/// Counting each `fma` as two floating-point operations, one iteration is
/// 4 fma x 4 lanes x 2 = 32 operations, i.e. 512 x 32 = 16384 per thread.
///
/// Neither kernel compares `tid` against a buffer length, so the dispatch must
/// not cover more threads than the buffer has 4-wide elements.
///
/// NOTE(review): presumably this string is compiled at runtime (e.g. with
/// `MTLDevice.makeLibrary(source:options:)`) for a GPU throughput test; the
/// call site is not visible in this file, so confirm against the caller.
/// The comments inside the literal are part of the shader source text and are
/// intentionally left untouched.
- let metalCode = """
- #include <metal_stdlib>
- using namespace metal;
- kernel void optimized_fp32(
- device float4 *buffer [[buffer(0)]],
- uint tid [[thread_position_in_grid]]
- ) {
- // 每个线程处理4个float元素
- float4 x = buffer[tid];
-
- // 展开循环增加计算强度
- for (int i = 0; i < 512; ++i) {
- // 每个循环8次浮点运算 x4 => 32 FLOPs/循环
- x = fma(x, float4(1.01), float4(0.97));
- x = fma(x, float4(0.99), float4(1.02));
- x = fma(x, float4(1.03), float4(0.98));
- x = fma(x, float4(0.96), float4(1.05));
- }
-
- buffer[tid] = x;
- }
- kernel void optimized_fp16(
- device half4 *buffer [[buffer(0)]],
- uint tid [[thread_position_in_grid]]
- ) {
- half4 x = buffer[tid];
-
- for (int i = 0; i < 512; ++i) {
- x = fma(x, half4(1.01), half4(0.97));
- x = fma(x, half4(0.99), half4(1.02));
- x = fma(x, half4(1.03), half4(0.98));
- x = fma(x, half4(0.96), half4(1.05));
- }
-
- buffer[tid] = x;
- }
- """
|