from algorithm import vectorize
# Element type of the buffer that main() allocates and sums.
alias dtype=DType.float32
# Width handed to vectorize in main(): twice the value simdwidthof reports
# for dtype.
alias SIMD_WIDTH = 2*simdwidthof[dtype]()
# Number of elements allocated, filled and summed in main().
alias NUM = 32
fn main():
    # Sums the same NUM Float32 values in five different ways and prints each
    # result. Every variant adds the same elements but in a different order or
    # grouping; Float32 addition is not associative, so the printed sums may
    # differ in their last digits.
    var v = DTypePointer[dtype]().alloc(NUM)
    for i in range(NUM):
        v[i] = i*0.2932

    fn f1() -> Float32:
        # Scalar sum, ascending index order.
        var val:Float32 = 0.0
        for i in range(NUM):
            val += v[i]
        return val

    fn f2() -> Float32:
        # Sum driven by vectorize; inside each chunk the elements are added
        # one at a time in ascending index order.
        var val:Float32 = 0.0

        @parameter
        fn _op[width: Int](iv: Int):
            # iv is the chunk's starting offset, width its element count.
            for j in range(width):
                val += v[iv+j]

        vectorize[_op, SIMD_WIDTH](size=NUM)
        return val

    fn f3() -> Float32:
        # Sum driven by vectorize; inside each chunk the elements are added
        # one at a time in descending index order (last element first).
        var val:Float32 = 0.0

        @parameter
        fn _op[width: Int](iv: Int):
            for j in range(width):
                val += v[iv+width-j-1]

        vectorize[_op, SIMD_WIDTH](size=NUM)
        return val

    fn f4() -> Float32:
        # Scalar sum, descending index order over the whole buffer.
        var val:Float32 = 0.0
        for i in range(NUM):
            val += v[NUM-i-1]
        return val

    fn f5() -> Float32:
        # Sum driven by vectorize; each chunk is loaded as one SIMD vector of
        # `width` lanes and collapsed to a scalar with reduce_add before being
        # added to the running total.
        var val:Float32 = 0.0

        @parameter
        fn _op[width: Int](iv: Int):
            val += v.load[width=width](iv).reduce_add[1]()

        vectorize[_op, SIMD_WIDTH](size=NUM)
        return val

    print("f1:",f1())
    print("f2:",f2(),"\n")
    print("f3:",f3(),"\n")
    print("f4:",f4())
    print("f5:",f5())

    # Release the buffer obtained from alloc(); the original never freed it.
    # Done after the last call because f1..f5 all read through v.
    v.free()
from algorithm import vectorize
# Element type of the buffer that main() allocates and sums.
alias dtype=DType.float32
# Width handed to vectorize in main(): twice the value simdwidthof reports
# for dtype.
alias SIMD_WIDTH = 2*simdwidthof[dtype]()
# Number of elements allocated, filled and summed in main().
alias NUM = 32
fn main():
    # Sums the same NUM Float32 values in five different ways and prints each
    # result. Every variant adds the same elements but in a different order or
    # grouping; Float32 addition is not associative, so the printed sums may
    # differ in their last digits.
    var v = DTypePointer[dtype]().alloc(NUM)
    for i in range(NUM):
        v[i] = i*0.2932

    fn f1() -> Float32:
        # Scalar sum, ascending index order.
        var val:Float32 = 0.0
        for i in range(NUM):
            val += v[i]
        return val

    fn f2() -> Float32:
        # Sum driven by vectorize; inside each chunk the elements are added
        # one at a time in ascending index order.
        var val:Float32 = 0.0

        @parameter
        fn _op[width: Int](iv: Int):
            # iv is the chunk's starting offset, width its element count.
            for j in range(width):
                val += v[iv+j]

        vectorize[_op, SIMD_WIDTH](size=NUM)
        return val

    fn f3() -> Float32:
        # Sum driven by vectorize; inside each chunk the elements are added
        # one at a time in descending index order (last element first).
        var val:Float32 = 0.0

        @parameter
        fn _op[width: Int](iv: Int):
            for j in range(width):
                val += v[iv+width-j-1]

        vectorize[_op, SIMD_WIDTH](size=NUM)
        return val

    fn f4() -> Float32:
        # Scalar sum, descending index order over the whole buffer.
        var val:Float32 = 0.0
        for i in range(NUM):
            val += v[NUM-i-1]
        return val

    fn f5() -> Float32:
        # Sum driven by vectorize; each chunk is loaded as one SIMD vector of
        # `width` lanes and collapsed to a scalar with reduce_add before being
        # added to the running total.
        var val:Float32 = 0.0

        @parameter
        fn _op[width: Int](iv: Int):
            val += v.load[width=width](iv).reduce_add[1]()

        vectorize[_op, SIMD_WIDTH](size=NUM)
        return val

    print("f1:",f1())
    print("f2:",f2(),"\n")
    print("f3:",f3(),"\n")
    print("f4:",f4())
    print("f5:",f5())

    # Release the buffer obtained from alloc(); the original never freed it.
    # Done after the last call because f1..f5 all read through v.
    v.free()