将整个矩阵分解为这样的小块,每次完成一对小块的计算,以提高Cache的命中率。
提示: 图中n=N/m计算次序为A11*B11, A11*B12,…, A11*B1n,,由于反复使用A11,因此可以提高Cache的命中率。/*矩阵分开计算C=A*B --- C(i,j)等于A的第i行乘以第j列*/#include#include #include #include #include /* 生成n*n矩阵*/void GenerateMatrix(float *m, int n);void PrintMatrix(float *p, int n);void GeneralMul(float *A, float *B, float *C, int n);void ClearMatrix(float *m, int n);/* 矩阵分块计算*/void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m);/* 两个矩阵的误差*/float diff(float *C1, float *C0, int n);struct ARG { float *A; int ax, ay; float *B; int bx, by; float *C; int cx, cy; int m; int n;};int main(int argc, char **argv){ if (argc != 4) { printf("Usage: %s N thread_num M\n", argv[0]); return 0; } int n=atoi(argv[1]); int thread_num = atoi(argv[2]); int m = atoi(argv[3]); float *A = new float[n*n]; float *B = new float[n*n]; float *C = new float[n*n]; float *C0 = new float[n*n]; GenerateMatrix(A, n); GenerateMatrix(B, n); clock_t start; float time_used; ClearMatrix(C0, n); start=clock(); GeneralMul(A, B, C0, n); time_used = static_cast (clock() - start)/CLOCKS_PER_SEC*1000; printf("General: time = %f\n", time_used); ClearMatrix(C, n); start=clock(); BlockCacul(A, B, C, n, thread_num, m); time_used = static_cast (clock() - start)/CLOCKS_PER_SEC*1000; printf("Block: time = %f\n", time_used); printf("Difference of two result: %f\n", diff(C0, C, n)); delete [] A; delete [] B; delete [] C; delete [] C0; return 0;}void ClearMatrix(float *m, int n){ for (int i=0; i A; float *B = p->B; float *C = p->C; int m = p->m; int n = p->n; for (int i=0; i cx)*n+p->cy+j; for (int k=0; k ax+i)*n+p->ay+k]*B[(p->bx+k)*n+p->by+j]; } } } return 0;}void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m){ //m = static_cast (sqrt(m)); struct ARG *args = new struct ARG[thread_num]; HANDLE *handle = new HANDLE[thread_num]; int t=0; int i; for (i=0; i (rand())/ (static_cast (rand())+ static_cast (0.55)); p++; }}float diff(float *C1, float *C0, int n){ float rst=0.0; float t; for (int i=0; i