Originally Posted by

**gens**
a program that gives wrong results is infinitely slower than any program that does

(...)

unrolled loops also help.. to a point

I got a new AMD now, but I think Intel still suffers from half-aligned reads/writes (16-byte aligned is usually the best)

So this is my "final" C code, which relies on the same compiler optimizations but uses no alignment hints. Compiled with basically "-O3 -ffast-math -flto", it runs faster than the most aligned code I could write (see the next block of code).

Code:

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <sys/time.h>
/*
 * Read the processor time-stamp counter using the x86 RDTSC instruction.
 *
 * The instruction leaves the low 32 bits in EAX and the high 32 bits in EDX;
 * the two halves are combined into one 64-bit value.
 *
 * Returns the current 64-bit tick count.
 */
unsigned long long int rdtsc(void)
{
    unsigned a, d;

    /* volatile: each call must re-execute the instruction. */
    __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
    return ((unsigned long long)a) | (((unsigned long long)d) << 32);
}
/* Input: one row-major 3x3 matrix (9 floats) per item. */
float matrices[10000][9];
/* Input: one 3-component vertex per item. */
float vertices[10000][3];
/* Output: result[i] receives matrices[i] multiplied by vertices[i]. */
float result[10000][3];

/*
 * Multiply each of the first `count` 3x3 matrices by the vertex with the
 * same index and store the 3-component product in the matching row of
 * `result`.
 *
 * count: number of items to process; must not exceed the first dimension
 *        of the global arrays (10000).
 *
 * Each product is written to result[i]; the earlier memcpy into &result
 * overwrote row 0 on every iteration and left all other rows untouched.
 */
void compute(int count ) {
    int i, j, k;

    for (i = 0; i < count; i++) {
        for (j = 0; j < 3; j++) {
            /* Dot product of matrix row j with the vertex. */
            float partial = 0.0f;
            for (k = 0; k < 3; k++) {
                partial += vertices[i][k] * matrices[i][j * 3 + k];
            }
            result[i][j] = partial;
        }
    }
}
/*
 * Benchmark driver: fills the global vertex and matrix tables with
 * consecutive values starting at 0, times one call to compute() with the
 * time-stamp counter, then prints the elapsed ticks and the first 24
 * result components.
 */
int main() {
    const int count = 10000;
    int row, col;
    float next;

    /* Vertices receive 0, 1, 2, ... in row-major order. */
    next = 0.0f;
    for (row = 0; row < count; row++) {
        for (col = 0; col < 3; col++) {
            vertices[row][col] = next;
            next = next + 1;
        }
    }

    /* Matrices receive their own 0, 1, 2, ... sequence. */
    next = 0.0f;
    for (row = 0; row < count; row++) {
        for (col = 0; col < 9; col++) {
            matrices[row][col] = next;
            next = next + 1;
        }
    }

    unsigned long long start = rdtsc();
    compute(count);
    unsigned long long elapsed = rdtsc() - start;
    printf("elapsed ticks: %llu\n", elapsed);

    /* First 8 result rows = 24 components. */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 3; col++) {
            printf("%f ", result[row][col]);
        }
    }
    printf("\n");
    return 0;
}

2nd version:

Code:

/*
 * Multiply `count` row-major 3x3 matrices by `count` 3-component vertices.
 *
 * matrix: count * 9 floats, one 3x3 matrix after another.
 * vertex: count * 3 floats, one vertex after another.
 * result: count * 3 floats; item i receives matrix i times vertex i.
 * count:  number of matrix/vertex pairs to process.
 *
 * All three base pointers must be 16-byte aligned (promised to the compiler
 * through __builtin_assume_aligned) and must not overlap (__restrict).
 */
void compute(
    float * __restrict matrix,
    float * __restrict vertex,
    float * __restrict result,
    int count ) {
    int i, j, k;
    const float *m = __builtin_assume_aligned(matrix, 16);
    const float *v = __builtin_assume_aligned(vertex, 16);
    float *r = __builtin_assume_aligned(result, 16);

    for (i = 0; i < count; i++) {
        for (j = 0; j < 3; j++) {
            float accumulator = 0.0f;
            /* Row j of the current matrix starts at m[j * 3]. */
            for (k = 0; k < 3; k++) {
                accumulator += m[j * 3 + k] * v[k];
            }
            /* Store through the advancing cursor, not the base pointer. */
            r[j] = accumulator;
        }
        m += 9;
        r += 3;
        v += 3;
    }
}
#include <stdio.h>
#include <sys/time.h>
/*
 * Read the processor time-stamp counter using the x86 RDTSC instruction.
 *
 * The instruction leaves the low 32 bits in EAX and the high 32 bits in EDX;
 * the two halves are combined into one 64-bit value.
 *
 * Returns the current 64-bit tick count.
 */
unsigned long long int rdtsc(void)
{
    unsigned a, d;

    /* volatile: each call must re-execute the instruction. */
    __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
    return ((unsigned long long)a) | (((unsigned long long)d) << 32);
}
/*
 * Benchmark driver for the pointer-based compute(): fills flat matrix and
 * vertex buffers with consecutive values starting at 0, times one call to
 * compute() with the time-stamp counter, then prints the elapsed ticks and
 * the first 24 result components.
 */
int main() {
    /*
     * Static storage keeps about 1.2 MB of buffers off the stack and
     * zero-initialises them. The explicit 16-byte alignment backs the
     * alignment promise compute() makes via __builtin_assume_aligned.
     */
    static float matrices[100000] __attribute__((aligned(16)));
    static float vertices[100000] __attribute__((aligned(16)));
    static float result[100000] __attribute__((aligned(16)));
    int i;
    int count = 10000;
    float tmp;

    /* compute() reads 3 floats per vertex ... */
    tmp = 0.0f;
    for (i = 0; i < count * 3; i++) {
        vertices[i] = tmp;
        tmp = tmp + 1;
    }

    /* ... and 9 floats per matrix, so all count * 9 entries must be set. */
    tmp = 0.0f;
    for (i = 0; i < count * 9; i++) {
        matrices[i] = tmp;
        tmp = tmp + 1;
    }

    unsigned long long ts = rdtsc();
    compute(matrices, vertices, result, count);
    printf("elapsed ticks: %llu\n", rdtsc() - ts);

    for (i = 0; i < 24; i++) {
        printf("%f ", result[i]);
    }
    printf("\n");
    return 0;
}

Changed data structure to store bi-dimensional arrays:

Code:

$ ./a.out
elapsed ticks: 263760

Same data structure (2nd implementation) with alignment hints and merging of the loops

Code:

$ ./a.out
elapsed ticks: 331592

I think that none of the implementations were in fact vectorized, but they run smoking fast, and faster than the +30% speedup (I think; I don't know how to compile the assembly part using the **as** tool!?). Both run more than 100% faster than the original code you gave me (which was running in the 700000-ticks zone). So maybe it is time for you to improve your C skills to get up to speed with fast C code!?