There's very little documentation for FFmpeg, so a lot of what I wrote is based on things sussed out from other projects.
_mm_add_psEXTERN_C DLLENTRY void DLLCALL conv_yuv444_to_rgb_sse_thr(const int threadcount, size_t length, byte* Y, byte* U, byte* V, vec4u8* rgba)
{
_CSR_SAVE();
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
#pragma omp parallel for num_threads(threadcount)
for (__int64 i = 0; i < length; i++)
{
__m128 frgba = { 0,0,0,0 };
__m128 yuv0 = __m128{Y[i] * inv255, U[i] * inv255, V[i] * inv255, 1};
yuv0 =_mm_add_ps(yuv0, offset);
frgba = _mm_add_ps(frgba, _mm_dp_ps(yuv0, Rcoeff, 0x71)); // or reverse???
frgba = _mm_add_ps(frgba, _mm_dp_ps(yuv0, Gcoeff, 0x72)); // or reverse???
frgba = _mm_add_ps(frgba, _mm_dp_ps(yuv0, Bcoeff, 0x74)); // or reverse???
frgba = _mm_min_ps(frgba, one);
frgba = _mm_max_ps(frgba, zero);
rgba[i] = vec4u8{
byte(frgba.m128_f32[0] * 255),
byte(frgba.m128_f32[1] * 255),
byte(frgba.m128_f32[2] * 255),
255
};
}
_CSR_RESTORE();
}