/* Hello-world with SSE intrinsics.  Every intrinsic used here is
   plain SSE2, so nothing in this file actually requires SSE 4.2.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdalign.h>

#include <x86intrin.h>

int main(int argc, char **argv)
{
  /* Number of 16-bit lanes that fit in one 128-bit vector. */
  enum { LANES = 8 };

  /* Command-line arguments are not used by this demo. */
  (void)argc;
  (void)argv;

  /* Type cheat-sheet: __m128 holds four floats, __m128d two doubles,
     __m128i packed integers, and __m64 is the old MMX type.  The
     _epi16 suffix means "packed 16-bit integers in a 128-bit vector".
     _mm_set_epi16 lists its arguments from the highest lane down to
     the lowest, so the last argument is the one stored at index 0. */
  const __m128i lhs = _mm_set_epi16(225, 210, 42, 99, -32767, -32768, -5, 0);
  const __m128i rhs = _mm_set_epi16(0, 11, 2, -64, -1, 2, -32767, 1024);

  /* Lane-wise sum; each lane keeps only the low 16 bits of its result. */
  const __m128i sum = _mm_add_epi16(lhs, rhs);

  /* mullo keeps the low 16 bits of each signed product, mulhi the
     high 16 bits. */
  const __m128i prod_lo = _mm_mullo_epi16(lhs, rhs);
  const __m128i prod_hi = _mm_mulhi_epi16(lhs, rhs);

  /* A single alignas applies to every array declared in this
     statement; see <https://stackoverflow.com/a/46752535>.  The
     16-byte alignment is needed by the aligned stores below. */
  alignas(16) int16_t lhs_out[LANES], rhs_out[LANES], sum_out[LANES],
                      lo_out[LANES], hi_out[LANES];

  /* _mm_store_si128 writes a whole 128-bit vector to aligned memory. */
  _mm_store_si128((__m128i *)lhs_out, lhs);
  _mm_store_si128((__m128i *)rhs_out, rhs);
  _mm_store_si128((__m128i *)sum_out, sum);
  _mm_store_si128((__m128i *)lo_out, prod_lo);
  _mm_store_si128((__m128i *)hi_out, prod_hi);

  for (int lane = 0; lane < LANES; lane++) {
    const int x = lhs_out[lane];
    const int y = rhs_out[lane];
    /* Weight of the high half within the full 32-bit product. */
    const int hi_scaled = (int)hi_out[lane] * 65536;

    printf("%d + %d = %d\n", x, y, (int)sum_out[lane]);
    /* The low half is printed as a signed 16-bit value, so the two
       numbers shown only add up to the true product when that low
       half is non-negative. */
    printf("%d · %d = %d (+ %d)\n", x, y, (int)lo_out[lane], hi_scaled);
  }

  return 0;
}