The following code: #include <mmintrin.h> #include <immintrin.h> int fn(int x) { __m64 mm = _mm_set1_pi32(x); mm = _mm_packs_pi16(mm, mm); __m128i xmm = _mm_movpi64_epi64(mm); xmm = _mm_packs_epi16(xmm, xmm); return _mm_cvtsi128_si32(xmm); } Compiled with '-O2 -Wall -mavx2 -m32 -fomit-frame-pointer' produces: fn(int): sub esp, 20 vbroadcastss xmm0, dword ptr [esp + 24] # Cool idea, but not vmovlps qword ptr [esp + 8], xmm0 # in our context. movq mm0, qword ptr [esp + 8] # !!! packsswb mm0, mm0 movq qword ptr [esp], mm0 # These moves are vmovq xmm0, qword ptr [esp] # correct. vpacksswb xmm0, xmm0, xmm0 vmovd eax, xmm0 add esp, 20 ret I know that MMX is not used anymore, but I wonder why clang prefers a code-path that is one instruction longer and contains 2 memory accesses more than a more straightforward 'punpckldq'.
Probably easiest if we just redefine the _mm_set1_pi* intrinsics along the lines of: static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set1_pi32(int __i) { __m64 src = _mm_cvtsi32_si64(__i); return _mm_unpacklo_pi32(src, src); } static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set1_pi16(short __i) { __m64 src = _mm_cvtsi32_si64(__i); src = _mm_unpacklo_pi16(src, src); return _mm_unpacklo_pi32(src, src); } static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set1_pi8(char __i) { __m64 src = _mm_cvtsi32_si64(__i); src = _mm_unpacklo_pi8(src, src); src = _mm_unpacklo_pi16(src, src); return _mm_unpacklo_pi32(src, src); } Craig - Any thoughts?
Instead of hacking the headers which would limit constant folding, I went for a DAG solution here: https://reviews.llvm.org/D43618 (and https://reviews.llvm.org/D43616)
rL327247
Thanks! I just checked it out and it works as expected.