| Home | CV | Publications | PhD | PostDoc | SIMD |
| Android | Glass | Astronomy | Retro | Games | Lego |
This page contains supplemental material for the following book.
/* Code Samples from the Software Vectorization Handbook * By Aart J.C. Bik, (C) 2004. * https://www.aartbik.com/ * * This software is *not* in the public domain. However, it is freely available * without any fee for education, research, and non-profit purposes. */ // shorthand type definitions typedef unsigned char u8; typedef signed char s8; typedef unsigned short u16; typedef signed short s16; typedef unsigned int u32; typedef signed int s32; typedef unsigned long long u64; typedef signed long long s64; typedef float f32; typedef double f64;
// saturation, page 14
#define N 128
static u16 x[N], y[N], z[N];
void sat(int n) {
int i;
for (i = 0; i < n; i++) {
int t = x[i] + y[i];
z[i] = (t <= 65535) ? t : 65535;
}
}
// floating-point precision, pages 75-76
#define N 128
float a[N];
void addSP() {
int i;
for (i = 0; i < 128; i++) {
a[i] = a[i] + 3.14159f;
}
}
void addDP() {
int i;
for (i = 0; i < 128; i++) {
a[i] = a[i] + 3.14159;
}
}
// framework, page 78
#define N 200
u8 x[N];
void setx(s32 n) {
s32 i;
for (i = 0; i < n; i++) x[i] = 0;
}
void setx_where_n_is_16() {
s32 i;
for (i = 0; i < 16; i++) x[i] = 0;
}
// mixed-type loop, page 79
#define N 200
s8 x[N];
f64 a[N], b[N];
void mixed() {
int i;
for (i = 0; i < N; i++) {
a[i] += 4.0;
x[i] &= 0x0f;
b[i] *= 2.0;
}
}
// aligned and unaligned access patterns, page 81
u16 x[65];
void alignment() {
int i;
for (i = 0; i < 64; i++) {
x[i] = x[i+1];
}
}
// rotating read-only memory references, pages 81-82
u16 a[64];
static const u16 primes[] = { 2, 3, 5, 7, 11, 13, 17, 19 };
void rotate1() {
int i;
for (i = 0; i < 64; i++) {
a[i] = primes[ (i+2) % 4 ];
}
}
void rotate2() {
int i;
for (i = 0; i < 64; i++) {
a[i] = primes[ i % 8 ];
}
}
// induction (T = u32, u8), pages 86-87
#define N 128
#define T u32
T v[N];
void induction(int n) {
int i;
u8 x = 253;
for (i = 0; i < N; i++) {
v[i] = (T) x;
x++;
}
}
// wide induction variable, page 87
s64 w[100];
void wide_induction(s64 x) {
int i;
for (i = 0; i < 100; i++) {
w[i] = x;
x += 4;
}
}
// floating-point induction, page 88
f64 a[100], b;
void inducDP(int n) {
int i;
b = 3.0;
for (i = 0; i < 100; i++) {
a[i] = b;
b += 1.5;
}
}
// mixed-type induction, page 89
f32 c[100];
void mixed_induction() {
int i;
s32 g = 0x7fffff00;
for (i = 0; i < 100; i++) {
c[i] = (f32) g;
g += 1;
}
}
// direct floating-point induction, page 89
f64 a[100];
void direct_inducFP() {
int i;
for (i = 0; i < 100; i++) {
a[i] = (f64) i;
}
}
// bitwise-AND reduction, page 92
u32 msk[256];
u32 x;
void mask() {
int i;
for (i = 0; i < 256; i++) {
x &= msk[i];
}
}
// scalar variable, pages 94-95
s32 x[64], y[64], z[64], s;
void scalar() {
int i;
for (i = 0; i < 64; i++) {
s = x[i];
y[i] = s - 1;
z[i] = s + 17;
}
}
// multiplication, page 98
s16 u[256], v[256], w[256];
void mult() {
int i;
for (i = 0; i < 256; i++) {
u[i] = v[i] * w[i];
}
}
// division by 32, page 98
u64 x[256];
void div32() {
int i;
for (i = 0; i < 256; i++) {
x[i] /= 32;
}
}
// min recognition, page 101
u8 u[256];
u8 min() {
int i;
u8 x = 0xff;
for (i = 0; i < 256; i++) {
if (u[i] < x) x = u[i];
}
return x;
}
// type conversions, page 102
s32 u[256], v[256];
u32 w[256];
void shifts() {
int i;
for (i = 0; i < 256; i++) {
u[i] = (v[i] >> 2) + (w[i] >> 2);
}
}
// contrived type conversions that should vectorize, page 103
#define N 128
u8 x[N], y[N];
void contrived() {
int i;
for (i = 0; i < N; i++) {
x[i] = ((u8) ((s32)((s8)((s32) y[i]))) );
}
}
// type conversion, page 103
s16 x[128];
void conversion() {
int i;
for (i = 0; i < 128; i++) {
x[i] = ((u8) x[i]);
}
}
// floating-point type conversion, page 104
f32 a[128];
f64 b[128];
f64 c;
void conversionFP() {
int i;
for (i = 0; i < 128; i++) {
a[i] += b[i] * c;
}
}
// float2double accumulation, page 105
f32 a[128];
f64 accumulate(void) {
int i;
f64 acc = 0;
for (i = 0; i < 128; i++) {
acc += a[i];
}
return acc;
}
// mixed-type conversion, page 106
f32 a[100], b[100];
s32 x[100];
void mixed_mult(int p) {
int i;
for (i = 0; i < 100; i++) {
a[i] = b[i] * x[i];
}
}
// math vectorization with SVML, page 108
f64 a[256], b[256], c[256];
void svml() {
int i;
for (i = 0; i < 256; i++) {
a[i] = pow(a[i], b[i]) + cos(c[i]);
}
}
// math vectorization with SVML, page 108
f32 a[256], b[256];
void sin_and_add() {
int i;
for (i = 0; i < 256; i++) {
a[i] = sin( b[i] ) + 1;
}
}
// bit-mask vectorization, page 111
// (pragma may be required to override efficiency heuristics)
#define N 100
s32 x[N], y[N];
s32 z[N];
void masked_if1() {
int i;
#pragma vector always
for (i = 0; i < N; i++) {
if (x[i] == y[i])
z[i] = 0;
}
}
// bit-mask vectorization, page 113
s16 u[800];
void masked_if2() {
int i;
for (i = 0; i < 800; i++) {
if (u[i] < 10)
u[i] = 0;
else
u[i] = 1;
}
}
// guarded error conditions, page 114
#define N1 99
#define N2 100
#define N 300
s32 x[N1], y[N2], *p = NULL;
void guarded_error(int flag) {
int i;
if (flag) {
p = malloc(sizeof(s32) * N);
}
for (i = 0; i < N; i++) {
if (i < N1) x[i] = 0;
if (i < N2) y[i] = 0;
if (flag) p[i] = 0;
}
}
// conditional scalar assignment, page 115
f64 a[128];
f64 acc = 0;
void cond_scalar() {
int i;
acc = 0;
for (i = 0; i < 128; i++) {
if (a[i] < -1) {
f64 t = a[i] + 1;
acc += t * t - t;
}
}
}
// 2-way conditional, page 116
f64 a[128], b[128];
void twoway_cond() {
int i;
for (i = 0; i < 128; i++) {
f64 t;
if (a[i] < 0) {
t = -1.0;
}
else {
t = 1.0;
}
b[i] = t;
}
}
// performance example, page 117
#define N 512
f32 a[N], b[N], c[N];
void divabc(s32 n) {
s32 i;
#pragma vector always
for (i = 0; i < n; i++) {
if (a[i] > 0) {
a[i] = b[i] / c[i];
}
}
}
// cache line split optimizations, page 123
f32 a[40], b[41], c[42], d[43];
void cache_line_split() {
int i;
for (i = 0; i < 40; i++) {
a[i] = b[i+1] * c[i+2] * d[i+3];
}
}
// cache line split performance, page 126
#define N 256
s32 x[N], y[N+1];
void alignment_perf(int n) {
int i;
for (i = 0; i < n; i++) {
x[i] = (x[i] + y[i+1]) >> 1;
}
}
// DDOT and SDOT, pages 136-137
f64 ddot(f64 *dx, f64 *dy, s32 n) {
s32 i;
f64 d = 0;
for (i = 0; i < n; i++) { d += dx[i] * dy[i]; }
return d;
}
f32 sdot(f32 *sx, f32 *sy, s32 n) {
s32 i;
f32 s = 0;
for (i = 0; i < n; i++) { s += sx[i] * sy[i]; }
return s;
}
// DAXPY and SAXPY, page 142
void daxpy(f64 *dx, f64 *dy, f64 s, s32 n) {
s32 i;
for (i = 0; i < n; i++) {
dy[i] += s * dx[i];
}
}
void saxpy(f32 *sx, f32 *sy, f32 s, s32 n) {
s32 i;
for (i = 0; i < n; i++) {
sy[i] += s * sx[i];
}
}
// conversion idiom, page 146
#define N 100
u8 x[N];
s16 y[N];
void conversion_idiom1(int n) {
int i;
for (i = 0; i < n; i++) {
y[i] += x[i];
}
}
// conversion idiom, page 147
#define N 100
f32 a[N];
s8 z[N];
void conversion_idiom2(int n) {
int i;
for (i = 0; i < n; i++) {
a[i] = z[i];
}
}
// arithmetic idiom, page 147
#define N 100
s16 x[N], y[N], z[N];
void arith_idiom1(int n) {
int i;
for (i = 0; i < n; i++) {
x[i] = (y[i] * z[i]) >> 18;
}
}
// arithmetic idiom, page 148
s16 u[256];
s32 v[256];
void arith_idiom2(int n) {
int i;
for (i = 0; i < n; i++) {
v[i] = u[i] * u[i];
}
}
// arithmetic idiom, page 149
#define N 100
u8 x[N], y[N];
void arith_idiom3(int n) {
int i;
for (i = 0; i < n; i++) {
x[i] = (x[i] + y[i] + 1) >> 1;
}
}
// reduction idiom, pages 149-150
#define N 100
u8 x[N], y[N];
s32 red;
void reduction_idiom(int n) {
int i;
red = 0;
for (i = 0; i < n; i++) {
s32 temp = x[i] - y[i];
if (temp < 0) temp = -temp;
red += temp;
}
}
// saturation idiom, pages 151 and 156
#define N 100
u16 x[N], y[N], z[N];
void sat_idiom1(int n) {
int i;
for (i = 0; i < n; i++) {
s32 t = x[i] + y[i];
z[i] = (t <= 65535) ? t : 65535;
}
}
// saturation idiom, page 156
u8 x[256];
s16 y[256];
void sat_idiom2(int n) {
int i;
for (i = 0; i < 256; i++) {
s32 t = y[i];
if (t < 0) t = 0;
if (t > 255) t = 255;
x[i] = t;
}
}
// saturation idiom, page 157
#define N 100
u8 u[N], v[N];
void sat_idiom3(int n) {
int i;
for (i = 0; i < n; i++) {
s32 x = (u[i] < 200) ? u[i]+55:255;
if (x > v[i]) v[i] = x;
}
}
// gzip flavored loop, page 159
#define N 256
u16 head[N];
void sat_idiom4(int n) {
int i;
for (i = 0; i < n; i++) {
u32 m = head[i];
head[i] = (m >= 32768) ? m-32768 : 0;
}
}
// search loop, page 161
#define N 32
float a[N+1];
int search(float x) {
int i;
for (i = 0; i < N; i++) {
if (a[i] == x) {
return i;
}
}
return -1; /* not found */
}
// loop materialization, page 177
f32 a[4];
f32 b[4];
f32 c[4];
void ordered() {
a[0] = b[0] * c[0];
a[1] = b[1] * c[1];
a[2] = b[2] * c[2];
a[3] = b[3] * c[3];
}
void unordered() {
a[0] = b[0] * c[0];
a[3] = b[3] * c[3];
a[2] = b[2] * c[2];
a[1] = b[1] * c[1];
}
// loop materialization and collapsing, page 181
struct { f32 x, y; } a[100];
void reset_structs() {
int i;
for (i = 0; i < 100; i++) {
a[i].x = 0;
a[i].y = 0;
}
}
// loop materialization on local variables, page 183
s32 z[4];
void f(void) {
s32 i, j, k, l;
z[0] = i+8;
z[2] = j+4;
z[1] = k+9;
z[3] = l+2;
}
// loop materialization and collapsing, page 184
typedef struct { f32 x, y; } pair;
typedef struct { pair e[2][2]; } matrix;
void add(matrix *a, f32 s) {
s32 i, j;
for (i=0;i<2;i++) {
for (j=0;j<2;j++){
a->e[i][j].x += s;
a->e[i][j].y += s;
}
}
}
// loop materialization and collapsing, page 189
void m4(f32 *dest, f32 *src, s32 n)
{
s32 i;
for (i = 0; i < n; i++) {
*(dest) = 0.5f * *(src);
*(dest+1) = 0.2f * *(src+1);
*(dest+2) = 0.1f * *(src+2);
*(dest+3) = 0.4f * *(src+3);
src += 4;
dest += 4;
}
}
// This example was used at page 195 to demonstrate compiler diagnostics.
// Make sure comments match the line numbers in the file.
/* 01 */ #define N 32
/* 02 */ float a[N], b[N], c[N], d[N];
/* 03 */
/* 04 */ doit() {
/* 05 */ int i;
/* 06 */ for (i = 0; i < N/2; i++) a[i] = i;
/* 07 */ for (i = 1; i < N; i++) {
/* 08 */ b[i] = b[i-1] + 2; /* data dependence cycle */
/* 09 */ c[i] = 1;
/* 10 */ }
/* 11 */ d[0] = 10;
/* 12 */ d[1] = 10;
/* 13 */ d[2] = 10;
/* 14 */ d[3] = 10;
/* 15 */}