Code Samples from The Software Vectorization Handbook

This page contains code samples that were used in: The book is out of print, but drop Aart an email if you are interested in a copy.
[BIKVEC04]
/* Code Samples from the Software Vectorization Handbook
 * By Aart J.C. Bik, (C) 2004.
 * http://www.aartbik.com/
 *
 * This software is *not* in the public domain. However, it is freely available
 * without any fee for education, research, and non-profit purposes. 
 */

// shorthand type definitions

typedef unsigned char      u8;
typedef signed   char      s8;
typedef unsigned short     u16;
typedef signed   short     s16;
typedef unsigned int       u32;
typedef signed   int       s32;
typedef unsigned long long u64;
typedef signed   long long s64;

typedef float    f32;
typedef double   f64;

// saturation, page 14

#define N 128

static u16 x[N], y[N], z[N];

void sat(int n) {
  int i;
  for (i = 0; i < n; i++) {
    int t = x[i] + y[i];
    z[i] = (t <= 65535) ? t : 65535;
  }
}

// floating-point precision, pages 75-76

#define N 128

float a[N];

void addSP() {
  int i;
  for (i = 0; i < 128; i++) {
    a[i] = a[i] + 3.14159f;
  }
}

void addDP() {
  int i;
  for (i = 0; i < 128; i++) {
    a[i] = a[i] + 3.14159;
  }
} 

// framework, page 78

#define N 200

u8 x[N];

void setx(s32 n) {
  s32 i;
  for (i = 0; i < n; i++) x[i] = 0;
} 

void setx_where_n_is_16() {
  s32 i;
  for (i = 0; i < 16; i++) x[i] = 0;
} 

// mixed-type loop, page 79

#define N 200

s8  x[N]; 
f64 a[N], b[N];

void mixed() {
  int i;
  for (i = 0; i < N; i++) {
    a[i] += 4.0;
    x[i] &= 0x0f;
    b[i] *= 2.0;
  }
} 

// aligned and unaligned access patterns, page 81

u16 x[65];

void alignment() {
  int i;
  for (i = 0; i < 64; i++) {
    x[i] = x[i+1];
  }
} 

// rotating read-only memory references, pages 81-82

u16 a[64];

static const u16 primes[] = { 2, 3, 5, 7, 11, 13, 17, 19 };

void rotate1() {
  int i;
  for (i = 0; i < 64; i++) {
    a[i] = primes[ (i+2) % 4 ];
  }
} 
  
void rotate2() {
  int i;
  for (i = 0; i < 64; i++) {
    a[i] = primes[ i % 8 ];
  }
} 

// induction (T = u32, u8), pages 86-87

#define N 128
#define T u32

T v[N];

void induction(int n) {
  int i;
  u8 x = 253;
  for (i = 0; i < N; i++) {
    v[i] = (T) x;
    x++;
  }
} 
 

// wide induction variable, page 87

s64 w[100];

void wide_induction(s64 x) {
  int i;
  for (i = 0; i < 100; i++) {
    w[i] = x;
    x += 4;
  }
} 

// floating-point induction, page 88

f64 a[100], b;

void inducDP(int n) {
  int i;
  b = 3.0;
  for (i = 0; i < 100; i++) {
    a[i] = b;
    b += 1.5;
  }
} 

// mixed-type induction, page 89

f32 c[100];

void mixed_induction() {
  int i;
  s32 g = 0x7fffff00;
  for (i = 0; i < 100; i++) {
    c[i] = (f32) g;
    g += 1;
  }
} 

// direct floating-point induction, page 89

f64 a[100];

void direct_inducFP() {
  int i;
  for (i = 0; i < 100; i++) {
    a[i] = (f64) i;
  }
}

// bitwise-AND reduction, page 92

u32 msk[256];
u32 x;

void mask() {
  int i;
  for (i = 0; i < 256; i++) {
     x &= msk[i];
  }
} 

// scalar variable, pages 94-95

s32 x[64], y[64], z[64], s;

void scalar() {
  int i;
  for (i = 0; i < 64; i++) {
     s = x[i];
     y[i] = s - 1;
     z[i] = s + 17;
  }
} 

// multiplication, page 98

s16 u[256], v[256], w[256];

void mult() {
  int i;
  for (i = 0; i < 256; i++) {
    u[i] = v[i] * w[i];
  }
} 

// division by 32, page 98

u64 x[256];

void div32() {
  int i;
  for (i = 0; i < 256; i++) {
    x[i] /= 32;
  }
}

// min recognition, page 101

u8 u[256];

u8 min() {
  int i;
  u8 x = 0xff;
  for (i = 0; i < 256; i++) {
    if (u[i] < x) x = u[i];
  }
  return x;
} 

// type conversions, page 102

s32 u[256], v[256];
u32 w[256];

void shifts() {
  int i;
  for (i = 0; i < 256; i++) {
    u[i] = (v[i] >> 2) + (w[i] >> 2);
  }
}

// contrived type conversions that should vectorize, page 103

#define N 128

u8 x[N], y[N];

void contrived() {
  int i;
  for (i = 0; i < N; i++) {
    x[i] = ((u8) ((s32)((s8)((s32) y[i]))) );
  }
} 

// type conversion, page 103

s16 x[128];

void conversion() {
  int i;
  for (i = 0; i < 128; i++) {
    x[i] = ((u8) x[i]);
  }
} 

// floating-point type conversion, page 104

f32 a[128];
f64 b[128];
f64 c;

void conversionFP() {
  int i;
  for (i = 0; i < 128; i++) {
    a[i] += b[i] * c;
  }
}

// float2double accumulation, page 105

f32 a[128]; 

f64 accumulate(void) {
  int i;
  f64 acc = 0;
  for (i = 0; i < 128; i++) {
    acc += a[i];
  }
  return acc;
}

// mixed-type conversion, page 106

f32 a[100], b[100]; 
s32 x[100];

void mixed_mult(int p) {
  int i;
  for (i = 0; i < 100; i++) {
    a[i] = b[i] * x[i];
  }
}

// math vectorization with SVML, page 108

f64 a[256], b[256], c[256];

void svml() {
  int i;
  for (i = 0; i < 256; i++) {
    a[i] = pow(a[i], b[i]) + cos(c[i]);
  }
}

// math vectorization with SVML, page 108

f32 a[256], b[256];

void sin_and_add() {
  int i;
  for (i = 0; i < 256; i++) {
    a[i] = sin( b[i] ) + 1;
  }
} 

// bit-mask vectorization, page 111
// (pragma may be required to override efficiency heuristics)

#define N 100

s32 x[N], y[N];
s32 z[N];

void masked_if1() {
  int i;
#pragma vector always
  for (i = 0; i < N; i++) {
    if (x[i] == y[i]) 
       z[i] = 0;
  }
} 

// bit-mask vectorization, page 113

s16 u[800];

void masked_if2() {
  int i;
  for (i = 0; i < 800; i++) {
    if (u[i] < 10) 
      u[i] = 0;
    else
      u[i] = 1;
  }
} 

// guarded error conditions, page 114

#define N1 99
#define N2 100
#define N  300

s32 x[N1], y[N2], *p = NULL;

void guarded_error(int flag) {
  int i;
  if (flag) {
    p = malloc(sizeof(s32) * N);
  }
  for (i = 0; i < N; i++) {
    if (i < N1) x[i] = 0;
    if (i < N2) y[i] = 0;
    if (flag)   p[i] = 0;
  }
}

// conditional scalar assignment, page 115 

f64 a[128];
f64 acc = 0;

void cond_scalar() {
  int i;
  acc = 0;
  for (i = 0; i < 128; i++) {
    if (a[i] < -1) {
      f64 t = a[i] + 1;
      acc += t * t - t;
    }
  }
}

// 2-way conditional, page 116

f64 a[128], b[128];

void twoway_cond() {
  int i;
  for (i = 0; i < 128; i++) {
    f64 t;
    if (a[i] < 0) {
      t =  -1.0;
    }
    else {
      t = 1.0;
    }
    b[i] = t;
  }
}

// performance example, page 117

#define N 512 

f32 a[N], b[N], c[N];

void divabc(s32 n) {
  s32 i;
#pragma vector always
  for (i = 0; i < n; i++) {
    if (a[i] > 0) {
      a[i] = b[i] / c[i];
    }
  }
}

// cache line split optimizations, page 123

f32 a[40], b[41], c[42], d[43]; 

void cache_line_split() {
  int i;
  for (i = 0; i < 40; i++) {
     a[i] = b[i+1] * c[i+2] * d[i+3];
  }
}

// cache line split performance, page 126

#define N 256 

s32 x[N], y[N+1];

void alignment_perf(int n) {
  int i;
  for (i = 0; i < n; i++) {
     x[i] = (x[i] + y[i+1]) >> 1;
  }
}

// DDOT and SDOT, pages 136-137

f64 ddot(f64 *dx, f64 *dy, s32 n) {
  s32 i;
  f64 d = 0;
  for (i = 0; i < n; i++) { d += dx[i] * dy[i]; }
  return d;
}

f32 sdot(f32 *sx, f32 *sy, s32 n) {
  s32 i;
  f32 s = 0;
  for (i = 0; i < n; i++) { s += sx[i] * sy[i]; }
  return s;
}

// DAXPY and SAXPY, page 142

void daxpy(f64 *dx, f64 *dy, f64 s, s32 n) {
  s32 i;
  for (i = 0; i < n; i++) { 
    dy[i] += s * dx[i]; 
  }
}

void saxpy(f32 *sx, f32 *sy, f32 s, s32 n) {
  s32 i;
  for (i = 0; i < n; i++) { 
    sy[i] += s * sx[i]; 
  }
}

// conversion idiom, page 146

#define N 100

u8  x[N];
s16 y[N];

void conversion_idiom1(int n) {
  int i;
  for (i = 0; i < n; i++) {
    y[i] += x[i];
  }
}

// conversion idiom, page 147

#define N 100

f32 a[N];
s8  z[N];

void conversion_idiom2(int n) {
  int i;
  for (i = 0; i < n; i++) {
    a[i] = z[i];
  }
}

// arithmetic idiom, page 147

#define N 100

s16 x[N], y[N], z[N];

void arith_idiom1(int n) {
  int i;
  for (i = 0; i < n; i++) {
    x[i] = (y[i] * z[i]) >> 18;
  }
}

// arithmetic idiom, page 148

s16 u[256];
s32 v[256];

void arith_idiom2(int n) {
  int i;
  for (i = 0; i < n; i++) {
    v[i] = u[i] * u[i];
  }
}

// arithmetic idiom, page 149

#define N 100

u8 x[N], y[N];

void arith_idiom3(int n) {
  int i;
  for (i = 0; i < n; i++) {
    x[i] = (x[i] + y[i] + 1) >> 1;
  }
}

// reduction idiom, pages 149-150

#define N 100

u8 x[N], y[N];

s32 red;

void reduction_idiom(int n) {
  int i;
  red = 0;
  for (i = 0; i < n; i++) {
    s32 temp = x[i] - y[i];
    if (temp < 0) temp = -temp;
    red += temp;
  }
}

// saturation idiom, pages 151 and 156

#define N 100

u16 x[N], y[N], z[N];

void sat_idiom1(int n) {
  int i;
  for (i = 0; i < n; i++) {
    s32 t = x[i] + y[i];
    z[i] = (t <= 65535) ? t : 65535;
  }
}

// saturation idiom, page 156

u8  x[256];
s16 y[256];

void sat_idiom2(int n) {
  int i;
  for (i = 0; i < 256; i++) {
    s32 t = y[i];
    if (t < 0) t = 0;
    if (t > 255) t = 255;
    x[i] = t;
  }
}

// saturation idiom, page 157

#define N 100

u8 u[N], v[N];

void sat_idiom3(int n) {
  int i;
  for (i = 0; i < n; i++) {
    s32 x = (u[i] < 200) ? u[i]+55:255;
    if (x > v[i]) v[i] = x;
  }
}

// gzip flavored loop, page 159

#define N 256

u16 head[N];

void sat_idiom4(int n) {
  int i;
  for (i = 0; i < n; i++) {
    u32 m = head[i]; 
    head[i] = (m >= 32768) ? m-32768 : 0;
  }
}  

// search loop, page 161

#define N 32 

float a[N+1];

int search(float x) {
  int i;
  for (i = 0; i < N; i++) {
    if (a[i] == x) {
      return i;
    }
  }
  return -1; /* not found */
}

// loop materialization, page 177

f32 a[4];
f32 b[4];
f32 c[4];

void ordered() {
  a[0] = b[0] * c[0];
  a[1] = b[1] * c[1];
  a[2] = b[2] * c[2];
  a[3] = b[3] * c[3];
} 
  
void unordered() {
  a[0] = b[0] * c[0];
  a[3] = b[3] * c[3];
  a[2] = b[2] * c[2];
  a[1] = b[1] * c[1];
}

// loop materialization and collapsing, page 181

struct { f32 x, y; } a[100];

void reset_structs() {
int i;
for (i = 0; i < 100; i++) {
   a[i].x = 0;
   a[i].y = 0;
}
}

// loop materialization on local variables, page 183

s32 z[4];

void f(void) {
   s32 i, j, k, l;
   z[0] = i+8;
   z[2] = j+4;
   z[1] = k+9;
   z[3] = l+2;
}

// loop materialization and collapsing, page 184

typedef struct { f32 x, y;     } pair;
typedef struct { pair e[2][2]; } matrix;

void add(matrix *a, f32 s) {
  s32 i, j;
  for (i=0;i<2;i++) {
    for (j=0;j<2;j++){
      a->e[i][j].x += s;
      a->e[i][j].y += s;
    }
  }
}

// loop materialization and collapsing, page 189

void m4(f32 *dest, f32 *src, s32 n)
{
  s32 i;
  for (i = 0; i < n; i++) {
       *(dest)   = 0.5f * *(src);
       *(dest+1) = 0.2f * *(src+1);
       *(dest+2) = 0.1f * *(src+2);
       *(dest+3) = 0.4f * *(src+3);
       src  += 4;
       dest += 4;
 }
}

// This example was used at page 195 to demonstrate compiler diagnostics.
// Make sure comments match the line numbers in the file.

/* 01 */ #define N 32
/* 02 */ float a[N], b[N], c[N], d[N];
/* 03 */
/* 04 */ doit() {
/* 05 */  int i;
/* 06 */  for (i = 0; i < N/2; i++) a[i] = i;
/* 07 */  for (i = 1; i < N; i++) {
/* 08 */   b[i] = b[i-1] + 2;  /* data dependence cycle */
/* 09 */   c[i] = 1;
/* 10 */  }
/* 11 */  d[0] = 10;
/* 12 */  d[1] = 10;
/* 13 */  d[2] = 10;
/* 14 */  d[3] = 10;
/* 15 */}

Please note that this page is privately maintained by Aart Bik. Google+ LinkedIn