Home	CV	Publications	PhD	PostDoc	SIMD
Android	Glass	Astronomy	Retro	Games	Lego

Errata and Code Samples for The Software Vectorization Handbook

This page contains supplemental material for the following book.

The Software Vectorization Handbook. Applying Multimedia Extensions for Maximum Performance. ISBN 0-9743649-2-4, Intel Press, June, 2004.

Errata for First Printing

page 25: replace "a2-b3" with "a2-b2" in destination (courtesy Naftali Schwartz).
page 27: replace "stores the 16-bit sums" with "stores the 32-bit sums".
page 35: replace the immediate 230 with 245 in the shufps that is equal to movshdup.
page 38: change theoretical peak into 12GFLOPS for single- and 6GFLOPS for double-precision FP computations.
page 94: add "T" to two occurrences of "pcmpeq" in expansion table (viz. "pcmpeqT") and "d" to "pcmpeq" in example at the bottom (viz. "pcmpeqd").
page 94: replace "constants -1 and -17" with "constants -1 and 17" (courtesy Naftali Schwartz).
page 112: replace "pcmpneb,w,d" entries in Table 5.8 with "emulate with pcmpeqb,w,d" (no direct support of "ne" instructions).
page 113: remove pcmpneb,w row completely from table (no direct support of "ne" instructions).
page 127: remove ")" from caption of Figure 6.2.
page 190: remove "is" from "vector implementation is shown above".
page 209: remove "LOOP" from "BLOCK LOOP WAS VECTORIZED" in Table 9.5.
page 215: add brackets "(" ")" around switches "-O3 -Qipo -Qprof_use -QxP".

Errata for Second Printing

None known yet.

Code Samples

/* Code Samples from the Software Vectorization Handbook
 * By Aart J.C. Bik, (C) 2004.
 * https://www.aartbik.com/
 *
 * This software is *not* in the public domain. However, it is freely available
 * without any fee for education, research, and non-profit purposes. 
 */

// shorthand type definitions

typedef unsigned char      u8;
typedef signed   char      s8;
typedef unsigned short     u16;
typedef signed   short     s16;
typedef unsigned int       u32;
typedef signed   int       s32;
typedef unsigned long long u64;
typedef signed   long long s64;

typedef float    f32;
typedef double   f64;

// saturation, page 14

#define N 128

static u16 x[N], y[N], z[N];

void sat(int n) {
  int i;
  for (i = 0; i < n; i++) {
    int t = x[i] + y[i];
    z[i] = (t <= 65535) ? t : 65535;
  }
}

// floating-point precision, pages 75-76

#define N 128

float a[N];

void addSP() {
  int i;
  for (i = 0; i < 128; i++) {
    a[i] = a[i] + 3.14159f;
  }
}

void addDP() {
  int i;
  for (i = 0; i < 128; i++) {
    a[i] = a[i] + 3.14159;
  }
}

// framework, page 78

#define N 200

u8 x[N];

void setx(s32 n) {
  s32 i;
  for (i = 0; i < n; i++) x[i] = 0;
} 

void setx_where_n_is_16() {
  s32 i;
  for (i = 0; i < 16; i++) x[i] = 0;
}

// mixed-type loop, page 79

#define N 200

s8  x[N]; 
f64 a[N], b[N];

void mixed() {
  int i;
  for (i = 0; i < N; i++) {
    a[i] += 4.0;
    x[i] &= 0x0f;
    b[i] *= 2.0;
  }
}

// aligned and unaligned access patterns, page 81

u16 x[65];

void alignment() {
  int i;
  for (i = 0; i < 64; i++) {
    x[i] = x[i+1];
  }
}

// rotating read-only memory references, pages 81-82

u16 a[64];

static const u16 primes[] = { 2, 3, 5, 7, 11, 13, 17, 19 };

void rotate1() {
  int i;
  for (i = 0; i < 64; i++) {
    a[i] = primes[ (i+2) % 4 ];
  }
} 
  
void rotate2() {
  int i;
  for (i = 0; i < 64; i++) {
    a[i] = primes[ i % 8 ];
  }
}

// induction (T = u32, u8), pages 86-87

#define N 128
#define T u32

T v[N];

void induction(int n) {
  int i;
  u8 x = 253;
  for (i = 0; i < N; i++) {
    v[i] = (T) x;
    x++;
  }
}

// wide induction variable, page 87

s64 w[100];

void wide_induction(s64 x) {
  int i;
  for (i = 0; i < 100; i++) {
    w[i] = x;
    x += 4;
  }
}

// floating-point induction, page 88

f64 a[100], b;

void inducDP(int n) {
  int i;
  b = 3.0;
  for (i = 0; i < 100; i++) {
    a[i] = b;
    b += 1.5;
  }
}

// mixed-type induction, page 89

f32 c[100];

void mixed_induction() {
  int i;
  s32 g = 0x7fffff00;
  for (i = 0; i < 100; i++) {
    c[i] = (f32) g;
    g += 1;
  }
}

// direct floating-point induction, page 89

f64 a[100];

void direct_inducFP() {
  int i;
  for (i = 0; i < 100; i++) {
    a[i] = (f64) i;
  }
}

// bitwise-AND reduction, page 92

u32 msk[256];
u32 x;

void mask() {
  int i;
  for (i = 0; i < 256; i++) {
     x &= msk[i];
  }
}

// scalar variable, pages 94-95

s32 x[64], y[64], z[64], s;

void scalar() {
  int i;
  for (i = 0; i < 64; i++) {
     s = x[i];
     y[i] = s - 1;
     z[i] = s + 17;
  }
}

// multiplication, page 98

s16 u[256], v[256], w[256];

void mult() {
  int i;
  for (i = 0; i < 256; i++) {
    u[i] = v[i] * w[i];
  }
}

// division by 32, page 98

u64 x[256];

void div32() {
  int i;
  for (i = 0; i < 256; i++) {
    x[i] /= 32;
  }
}

// min recognition, page 101

u8 u[256];

u8 min() {
  int i;
  u8 x = 0xff;
  for (i = 0; i < 256; i++) {
    if (u[i] < x) x = u[i];
  }
  return x;
}

// type conversions, page 102

s32 u[256], v[256];
u32 w[256];

void shifts() {
  int i;
  for (i = 0; i < 256; i++) {
    u[i] = (v[i] >> 2) + (w[i] >> 2);
  }
}

// contrived type conversions that should vectorize, page 103

#define N 128

u8 x[N], y[N];

void contrived() {
  int i;
  for (i = 0; i < N; i++) {
    x[i] = ((u8) ((s32)((s8)((s32) y[i]))) );
  }
}

// type conversion, page 103

s16 x[128];

void conversion() {
  int i;
  for (i = 0; i < 128; i++) {
    x[i] = ((u8) x[i]);
  }
}

// floating-point type conversion, page 104

f32 a[128];
f64 b[128];
f64 c;

void conversionFP() {
  int i;
  for (i = 0; i < 128; i++) {
    a[i] += b[i] * c;
  }
}

// float2double accumulation, page 105

f32 a[128]; 

f64 accumulate(void) {
  int i;
  f64 acc = 0;
  for (i = 0; i < 128; i++) {
    acc += a[i];
  }
  return acc;
}

// mixed-type conversion, page 106

f32 a[100], b[100]; 
s32 x[100];

void mixed_mult(int p) {
  int i;
  for (i = 0; i < 100; i++) {
    a[i] = b[i] * x[i];
  }
}

// math vectorization with SVML, page 108

f64 a[256], b[256], c[256];

void svml() {
  int i;
  for (i = 0; i < 256; i++) {
    a[i] = pow(a[i], b[i]) + cos(c[i]);
  }
}

// math vectorization with SVML, page 108

f32 a[256], b[256];

void sin_and_add() {
  int i;
  for (i = 0; i < 256; i++) {
    a[i] = sin( b[i] ) + 1;
  }
}

// bit-mask vectorization, page 111
// (pragma may be required to override efficiency heuristics)

#define N 100

s32 x[N], y[N];
s32 z[N];

void masked_if1() {
  int i;
#pragma vector always
  for (i = 0; i < N; i++) {
    if (x[i] == y[i]) 
       z[i] = 0;
  }
}

// bit-mask vectorization, page 113

s16 u[800];

void masked_if2() {
  int i;
  for (i = 0; i < 800; i++) {
    if (u[i] < 10) 
      u[i] = 0;
    else
      u[i] = 1;
  }
}

// guarded error conditions, page 114

#define N1 99
#define N2 100
#define N  300

s32 x[N1], y[N2], *p = NULL;

void guarded_error(int flag) {
  int i;
  if (flag) {
    p = malloc(sizeof(s32) * N);
  }
  for (i = 0; i < N; i++) {
    if (i < N1) x[i] = 0;
    if (i < N2) y[i] = 0;
    if (flag)   p[i] = 0;
  }
}

// conditional scalar assignment, page 115 

f64 a[128];
f64 acc = 0;

void cond_scalar() {
  int i;
  acc = 0;
  for (i = 0; i < 128; i++) {
    if (a[i] < -1) {
      f64 t = a[i] + 1;
      acc += t * t - t;
    }
  }
}

// 2-way conditional, page 116

f64 a[128], b[128];

void twoway_cond() {
  int i;
  for (i = 0; i < 128; i++) {
    f64 t;
    if (a[i] < 0) {
      t =  -1.0;
    }
    else {
      t = 1.0;
    }
    b[i] = t;
  }
}

// performance example, page 117

#define N 512 

f32 a[N], b[N], c[N];

void divabc(s32 n) {
  s32 i;
#pragma vector always
  for (i = 0; i < n; i++) {
    if (a[i] > 0) {
      a[i] = b[i] / c[i];
    }
  }
}

// cache line split optimizations, page 123

f32 a[40], b[41], c[42], d[43]; 

void cache_line_split() {
  int i;
  for (i = 0; i < 40; i++) {
     a[i] = b[i+1] * c[i+2] * d[i+3];
  }
}

// cache line split performance, page 126

#define N 256 

s32 x[N], y[N+1];

void alignment_perf(int n) {
  int i;
  for (i = 0; i < n; i++) {
     x[i] = (x[i] + y[i+1]) >> 1;
  }
}

// DDOT and SDOT, pages 136-137

f64 ddot(f64 *dx, f64 *dy, s32 n) {
  s32 i;
  f64 d = 0;
  for (i = 0; i < n; i++) { d += dx[i] * dy[i]; }
  return d;
}

f32 sdot(f32 *sx, f32 *sy, s32 n) {
  s32 i;
  f32 s = 0;
  for (i = 0; i < n; i++) { s += sx[i] * sy[i]; }
  return s;
}

// DAXPY and SAXPY, page 142

void daxpy(f64 *dx, f64 *dy, f64 s, s32 n) {
  s32 i;
  for (i = 0; i < n; i++) { 
    dy[i] += s * dx[i]; 
  }
}

void saxpy(f32 *sx, f32 *sy, f32 s, s32 n) {
  s32 i;
  for (i = 0; i < n; i++) { 
    sy[i] += s * sx[i]; 
  }
}

// conversion idiom, page 146

#define N 100

u8  x[N];
s16 y[N];

void conversion_idiom1(int n) {
  int i;
  for (i = 0; i < n; i++) {
    y[i] += x[i];
  }
}

// conversion idiom, page 147

#define N 100

f32 a[N];
s8  z[N];

void conversion_idiom2(int n) {
  int i;
  for (i = 0; i < n; i++) {
    a[i] = z[i];
  }
}

// arithmetic idiom, page 147

#define N 100

s16 x[N], y[N], z[N];

void arith_idiom1(int n) {
  int i;
  for (i = 0; i < n; i++) {
    x[i] = (y[i] * z[i]) >> 18;
  }
}

// arithmetic idiom, page 148

s16 u[256];
s32 v[256];

void arith_idiom2(int n) {
  int i;
  for (i = 0; i < n; i++) {
    v[i] = u[i] * u[i];
  }
}

// arithmetic idiom, page 149

#define N 100

u8 x[N], y[N];

void arith_idiom3(int n) {
  int i;
  for (i = 0; i < n; i++) {
    x[i] = (x[i] + y[i] + 1) >> 1;
  }
}

// reduction idiom, pages 149-150

#define N 100

u8 x[N], y[N];

s32 red;

void reduction_idiom(int n) {
  int i;
  red = 0;
  for (i = 0; i < n; i++) {
    s32 temp = x[i] - y[i];
    if (temp < 0) temp = -temp;
    red += temp;
  }
}

// saturation idiom, pages 151 and 156

#define N 100

u16 x[N], y[N], z[N];

void sat_idiom1(int n) {
  int i;
  for (i = 0; i < n; i++) {
    s32 t = x[i] + y[i];
    z[i] = (t <= 65535) ? t : 65535;
  }
}

// saturation idiom, page 156

u8  x[256];
s16 y[256];

void sat_idiom2(int n) {
  int i;
  for (i = 0; i < 256; i++) {
    s32 t = y[i];
    if (t < 0) t = 0;
    if (t > 255) t = 255;
    x[i] = t;
  }
}

// saturation idiom, page 157

#define N 100

u8 u[N], v[N];

void sat_idiom3(int n) {
  int i;
  for (i = 0; i < n; i++) {
    s32 x = (u[i] < 200) ? u[i]+55:255;
    if (x > v[i]) v[i] = x;
  }
}

// gzip flavored loop, page 159

#define N 256

u16 head[N];

void sat_idiom4(int n) {
  int i;
  for (i = 0; i < n; i++) {
    u32 m = head[i]; 
    head[i] = (m >= 32768) ? m-32768 : 0;
  }
}

// search loop, page 161

#define N 32 

float a[N+1];

int search(float x) {
  int i;
  for (i = 0; i < N; i++) {
    if (a[i] == x) {
      return i;
    }
  }
  return -1; /* not found */
}

// loop materialization, page 177

f32 a[4];
f32 b[4];
f32 c[4];

void ordered() {
  a[0] = b[0] * c[0];
  a[1] = b[1] * c[1];
  a[2] = b[2] * c[2];
  a[3] = b[3] * c[3];
} 
  
void unordered() {
  a[0] = b[0] * c[0];
  a[3] = b[3] * c[3];
  a[2] = b[2] * c[2];
  a[1] = b[1] * c[1];
}

// loop materialization and collapsing, page 181

struct { f32 x, y; } a[100];

void reset_structs() {
int i;
for (i = 0; i < 100; i++) {
   a[i].x = 0;
   a[i].y = 0;
}
}

// loop materialization on local variables, page 183

s32 z[4];

void f(void) {
   s32 i, j, k, l;
   z[0] = i+8;
   z[2] = j+4;
   z[1] = k+9;
   z[3] = l+2;
}

// loop materialization and collapsing, page 184

typedef struct { f32 x, y;     } pair;
typedef struct { pair e[2][2]; } matrix;

void add(matrix *a, f32 s) {
  s32 i, j;
  for (i=0;i<2;i++) {
    for (j=0;j<2;j++){
      a->e[i][j].x += s;
      a->e[i][j].y += s;
    }
  }
}

// loop materialization and collapsing, page 189

void m4(f32 *dest, f32 *src, s32 n)
{
  s32 i;
  for (i = 0; i < n; i++) {
       *(dest)   = 0.5f * *(src);
       *(dest+1) = 0.2f * *(src+1);
       *(dest+2) = 0.1f * *(src+2);
       *(dest+3) = 0.4f * *(src+3);
       src  += 4;
       dest += 4;
 }
}

// This example was used at page 195 to demonstrate compiler diagnostics.
// Make sure comments match the line numbers in the file.

/* 01 */ #define N 32
/* 02 */ float a[N], b[N], c[N], d[N];
/* 03 */
/* 04 */ doit() {
/* 05 */  int i;
/* 06 */  for (i = 0; i < N/2; i++) a[i] = i;
/* 07 */  for (i = 1; i < N; i++) {
/* 08 */   b[i] = b[i-1] + 2;  /* data dependence cycle */
/* 09 */   c[i] = 1;
/* 10 */  }
/* 11 */  d[0] = 10;
/* 12 */  d[1] = 10;
/* 13 */  d[2] = 10;
/* 14 */  d[3] = 10;
/* 15 */}