Unnecessary nan-checks: performance issue or missing compile options.

I'm not sure whether this is a performance issue or a feature request, so I figured I'd ask here first.

The issue is a performance regression caused by unnecessary NaN checks in (e.g.) max and min operations.

from random import random_ui64
from time import now

fn gen_random_SIMD[T: DType, width: Int]() -> SIMD[T, width]:
    """Builds a SIMD vector whose lanes are filled with random values.

    Parameters:
        T: Element dtype of the resulting vector.
        width: Number of lanes in the resulting vector.

    Returns:
        A SIMD[T, width] in which every lane holds a value obtained from
        random_ui64(0, 100) and cast to T.
    """
    var result = SIMD[T, width]()
    # Fill lane by lane so each lane gets its own random draw.
    for i in range(width):
        result[i] = random_ui64(0, 100).cast[T]()
    return result

fn main():
    # Minimal reproducer: time a single SIMD max over two random float64
    # vectors so the code generated for `max` can be inspected.
    let data0 = gen_random_SIMD[DType.float64, 8]()
    let data1 = gen_random_SIMD[DType.float64, 8]()
    
    # The two now() calls bracket only the max operation under study.
    let start_time_ns = now()
    let data2 = data0.max(data1)  # we are interested in how max is handled.
    let elapsed_time_ns = now() - start_time_ns

    # Print the result so the max computation is actually used.
    print(data2)
    print("Elapsed time " + str(elapsed_time_ns) + " ns")

from random import random_ui64
from time import now

fn gen_random_SIMD[T: DType, width: Int]() -> SIMD[T, width]:
    """Builds a SIMD vector whose lanes are filled with random values.

    Parameters:
        T: Element dtype of the resulting vector.
        width: Number of lanes in the resulting vector.

    Returns:
        A SIMD[T, width] in which every lane holds a value obtained from
        random_ui64(0, 100) and cast to T.
    """
    var result = SIMD[T, width]()
    # Fill lane by lane so each lane gets its own random draw.
    for i in range(width):
        result[i] = random_ui64(0, 100).cast[T]()
    return result

fn main():
    # Minimal reproducer: time a single SIMD max over two random float64
    # vectors so the code generated for `max` can be inspected.
    let data0 = gen_random_SIMD[DType.float64, 8]()
    let data1 = gen_random_SIMD[DType.float64, 8]()
    
    # The two now() calls bracket only the max operation under study.
    let start_time_ns = now()
    let data2 = data0.max(data1)  # we are interested in how max is handled.
    let elapsed_time_ns = now() - start_time_ns

    # Print the result so the max computation is actually used.
    print(data2)
    print("Elapsed time " + str(elapsed_time_ns) + " ns")

from random import random_ui64
from time import now

fn gen_random_SIMD[T: DType, width: Int]() -> SIMD[T, width]:
    """Builds a SIMD vector whose lanes are filled with random values.

    Parameters:
        T: Element dtype of the resulting vector.
        width: Number of lanes in the resulting vector.

    Returns:
        A SIMD[T, width] in which every lane holds a value obtained from
        random_ui64(0, 100) and cast to T.
    """
    var result = SIMD[T, width]()
    # Fill lane by lane so each lane gets its own random draw.
    for i in range(width):
        result[i] = random_ui64(0, 100).cast[T]()
    return result

fn main():
    # Minimal reproducer: time a single SIMD max over two random float64
    # vectors so the code generated for `max` can be inspected.
    let data0 = gen_random_SIMD[DType.float64, 8]()
    let data1 = gen_random_SIMD[DType.float64, 8]()
    
    # The two now() calls bracket only the max operation under study.
    let start_time_ns = now()
    let data2 = data0.max(data1)  # we are interested in how max is handled.
    let elapsed_time_ns = now() - start_time_ns

    # Print the result so the max computation is actually used.
    print(data2)
    print("Elapsed time " + str(elapsed_time_ns) + " ns")

from random import random_ui64
from time import now

fn gen_random_SIMD[T: DType, width: Int]() -> SIMD[T, width]:
    """Builds a SIMD vector whose lanes are filled with random values.

    Parameters:
        T: Element dtype of the resulting vector.
        width: Number of lanes in the resulting vector.

    Returns:
        A SIMD[T, width] in which every lane holds a value obtained from
        random_ui64(0, 100) and cast to T.
    """
    var result = SIMD[T, width]()
    # Fill lane by lane so each lane gets its own random draw.
    for i in range(width):
        result[i] = random_ui64(0, 100).cast[T]()
    return result

fn main():
    # Minimal reproducer: time a single SIMD max over two random float64
    # vectors so the code generated for `max` can be inspected.
    let data0 = gen_random_SIMD[DType.float64, 8]()
    let data1 = gen_random_SIMD[DType.float64, 8]()
    
    # The two now() calls bracket only the max operation under study.
    let start_time_ns = now()
    let data2 = data0.max(data1)  # we are interested in how max is handled.
    let elapsed_time_ns = now() - start_time_ns

    # Print the result so the max computation is actually used.
    print(data2)
    print("Elapsed time " + str(elapsed_time_ns) + " ns")

<+278>:   call   0x5470 <clock_gettime@plt>
<+283>:   mov    rbx,QWORD PTR [rsp+0x40]
<+288>:   mov    rax,QWORD PTR [rsp+0x48]
<+293>:   mov    QWORD PTR [rsp+0x70],rax
<+298>:   vmovapd zmm0,ZMMWORD PTR [rsp+0xc0]
<+306>:   vmovapd zmm2,ZMMWORD PTR [rsp+0x100]
<+314>:   vmaxpd zmm1,zmm2,zmm0
<+320>:   vcmpunordpd k1,zmm0,zmm0
<+327>:   vmovapd zmm1{k1},zmm2
<+333>:   vmovapd ZMMWORD PTR [rsp+0xc0],zmm1
...
<+364>:   call   0x5470 <clock_gettime@plt>

<+278>:   call   0x5470 <clock_gettime@plt>
<+283>:   mov    rbx,QWORD PTR [rsp+0x40]
<+288>:   mov    rax,QWORD PTR [rsp+0x48]
<+293>:   mov    QWORD PTR [rsp+0x70],rax
<+298>:   vmovapd zmm0,ZMMWORD PTR [rsp+0xc0]
<+306>:   vmovapd zmm2,ZMMWORD PTR [rsp+0x100]
<+314>:   vmaxpd zmm1,zmm2,zmm0
<+320>:   vcmpunordpd k1,zmm0,zmm0
<+327>:   vmovapd zmm1{k1},zmm2
<+333>:   vmovapd ZMMWORD PTR [rsp+0xc0],zmm1
...
<+364>:   call   0x5470 <clock_gettime@plt>

<+278>:   call   0x5470 <clock_gettime@plt>
<+283>:   mov    rbx,QWORD PTR [rsp+0x40]
<+288>:   mov    rax,QWORD PTR [rsp+0x48]
<+293>:   mov    QWORD PTR [rsp+0x70],rax
<+298>:   vmovapd zmm0,ZMMWORD PTR [rsp+0xc0]
<+306>:   vmovapd zmm2,ZMMWORD PTR [rsp+0x100]
<+314>:   vmaxpd zmm1,zmm2,zmm0
<+320>:   vcmpunordpd k1,zmm0,zmm0
<+327>:   vmovapd zmm1{k1},zmm2
<+333>:   vmovapd ZMMWORD PTR [rsp+0xc0],zmm1
...
<+364>:   call   0x5470 <clock_gettime@plt>

<+278>:   call   0x5470 <clock_gettime@plt>
<+283>:   mov    rbx,QWORD PTR [rsp+0x40]
<+288>:   mov    rax,QWORD PTR [rsp+0x48]
<+293>:   mov    QWORD PTR [rsp+0x70],rax
<+298>:   vmovapd zmm0,ZMMWORD PTR [rsp+0xc0]
<+306>:   vmovapd zmm2,ZMMWORD PTR [rsp+0x100]
<+314>:   vmaxpd zmm1,zmm2,zmm0
<+320>:   vcmpunordpd k1,zmm0,zmm0
<+327>:   vmovapd zmm1{k1},zmm2
<+333>:   vmovapd ZMMWORD PTR [rsp+0xc0],zmm1
...
<+364>:   call   0x5470 <clock_gettime@plt>

+298 and +306 load data0 and data1 into zmm0 and zmm2.
+314 calculates the maximum of zmm0 and zmm2 and stores the result in zmm1.
+320 sets mask register k1 for each lane in which zmm0 (data0) is NaN.
+327 overwrites the result (zmm1) with the value of data1 (zmm2) in every lane where zmm0 was NaN.
+333 writes the result back to memory.

If data0 could contain NaN values, the above assembly would be correct. But when data0 does not contain such values, the code has a performance regression, because a NaN check is performed for every float min/max operation. This is something I would like to control in HPC AI workloads.

Q: Is this a regression bug or something else (for which I need to make a feature request)?

Henk-Jan LebbinkOP•2/10/24, 6:35 PM

Did some more digging:

alias T2 = SIMD[T, channels]
let data2: T2 = data0.max(data1)

alias T2 = SIMD[T, channels]
let data2: T2 = data0.max(data1)

is equivalent to (i.e., generates exactly the same assembly as):

alias T2 = SIMD[T, channels]
let data2: T2 = rebind[SIMD[T, channels]](llvm_intrinsic["llvm.maxnum", T2, T2, T2](data0, data1))

alias T2 = SIMD[T, channels]
let data2: T2 = rebind[SIMD[T, channels]](llvm_intrinsic["llvm.maxnum", T2, T2, T2](data0, data1))

The semantics of llvm.maxnum dictate the observed NaN behaviour:

llvm.maxnum.f64: This intrinsic computes the maximum value between two floating-point 
numbers (f64), but if one of the inputs is NaN, it returns the other input value. In other 
words, if one of the operands is NaN, the result will be the non-NaN operand. This behavior
is often referred to as "maximum number" semantics.

llvm.maxnum.f64: This intrinsic computes the maximum value between two floating-point 
numbers (f64), but if one of the inputs is NaN, it returns the other input value. In other 
words, if one of the operands is NaN, the result will be the non-NaN operand. This behavior
is often referred to as "maximum number" semantics.

Q: How can I tell Mojo that the arguments of max are larger than zero (and thus not NaN), or how can the following LLVM IR be expressed with intrinsics?

%value = load float, float* %ptr
%is_positive = fcmp ogt float %value, 0.0
call void @llvm.assume(i1 %is_positive)

%value = load float, float* %ptr
%is_positive = fcmp ogt float %value, 0.0
call void @llvm.assume(i1 %is_positive)

benny•2/11/24, 12:56 AM

llvm.umax?

Bbenny llvm.umax?

sora•2/11/24, 12:57 AM

That's for integers.

Ssora That's for integers.

benny•2/11/24, 12:58 AM

you're right, my mistake

Henk-Jan LebbinkOP•2/11/24, 8:23 PM

Did some more research and tried to solve the issue in MLIR by adding a 'fast' flag.

# Two random 16-lane float32 vectors used as inputs to the max operation.
let d0: SIMD[DType.float32, 16] = gen_random_SIMD[DType.float32, 16]()
let d1: SIMD[DType.float32, 16] = gen_random_SIMD[DType.float32, 16]()
# Constant-valued alternative inputs (disabled):
#let d0: SIMD[DType.float32, 16] = 10.
#let d1: SIMD[DType.float32, 16] = 20.

# Rebind the SIMD values to the raw MLIR vector type that is also used as the
# result type of the op below.
let d0x = rebind[__mlir_type.`vector<16xf32>`](d0)
let d1x = rebind[__mlir_type.`vector<16xf32>`](d1)

# Time only the maxnum op; `_flag = 'fast'` is the attempt to request
# fast-math semantics for it.
let start_time_ns = now()
var tmp = __mlir_op.`llvm.intr.maxnum`[_type = __mlir_type.`vector<16xf32>`, _flag = 'fast'](d0x, d1x)
let elapsed_time_ns = now() - start_time_ns

# Convert the raw MLIR vector back to a SIMD value so it can be printed.
let d2 = rebind[SIMD[DType.float32, 16]](tmp)
print(d2)
print("Elapsed time " + str(elapsed_time_ns) + " ns")

# Two random 16-lane float32 vectors used as inputs to the max operation.
let d0: SIMD[DType.float32, 16] = gen_random_SIMD[DType.float32, 16]()
let d1: SIMD[DType.float32, 16] = gen_random_SIMD[DType.float32, 16]()
# Constant-valued alternative inputs (disabled):
#let d0: SIMD[DType.float32, 16] = 10.
#let d1: SIMD[DType.float32, 16] = 20.

# Rebind the SIMD values to the raw MLIR vector type that is also used as the
# result type of the op below.
let d0x = rebind[__mlir_type.`vector<16xf32>`](d0)
let d1x = rebind[__mlir_type.`vector<16xf32>`](d1)

# Time only the maxnum op; `_flag = 'fast'` is the attempt to request
# fast-math semantics for it.
let start_time_ns = now()
var tmp = __mlir_op.`llvm.intr.maxnum`[_type = __mlir_type.`vector<16xf32>`, _flag = 'fast'](d0x, d1x)
let elapsed_time_ns = now() - start_time_ns

# Convert the raw MLIR vector back to a SIMD value so it can be printed.
let d2 = rebind[SIMD[DType.float32, 16]](tmp)
print(d2)
print("Elapsed time " + str(elapsed_time_ns) + " ns")

This works, but only when the data (d0 and d1) have constant values.

Does anyone know how to get the above code to work?
The first two rebinds somehow yield null values:

%65 = "llvm.intr.maxnum"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmathFlags = #llvm.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> vector<16xf32>

%65 = "llvm.intr.maxnum"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmathFlags = #llvm.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> vector<16xf32>