#include #include #include #include #include #include #include #include #include #include "matrix_multiply.h" /* * Allocates a row-by-cols matrix and returns it */ matrix_t *make_matrix(int rows, int cols) { matrix_t *new_matrix = malloc(sizeof(matrix_t)); new_matrix->rows = rows; new_matrix->cols = cols; new_matrix->colstride = rows; new_matrix->values = (double *) malloc(sizeof(double) * rows * cols); return new_matrix; } /* * Frees an allocated matrix (not a submatrix) */ void free_matrix(matrix_t *m) { free(m->values); free(m); } /* * Print Matrix */ void print_matrix(matrix_t *m) { int i, j; printf("------------\n"); for (i = 0; i < m->rows; i++) { for (j = 0; j < m->cols; j++) { printf(" %g ", element(m,i,j)); } printf("\n"); } printf("------------\n"); } int make_submatrix(matrix_t *new_matrix, matrix_t *A, int start_row, int start_col, int rows, int cols) { assert(start_row+rows <= A->rows); assert(start_col+cols <= A->cols); new_matrix->rows = rows; new_matrix->cols = cols; new_matrix->colstride = A->colstride; new_matrix->values = &element(A, start_row, start_col); return 0; } /** * Multiply matrix A*B, store result in C. * Version 1: Vanilla ijk nested loops. * This can be easily modified for other index orders: ikj, jik, jki, kij, kji. */ int matrix_multiply_run_1(matrix_t *A, matrix_t *B, matrix_t *C) { int i, j, k; for (i = 0; i < A->rows; i++) { for (j = 0; j < B->cols; j++) { for (k = 0; k < A->cols; k++) { element(C,i,j) += element(A,i,k) * element(B,k,j); } } } return 0; } /** * Multiply matrix A*B, store result in C. * Version 2: Use best index order (kji, so inner loop follows cache lines) */ int matrix_multiply_run_2(matrix_t *A, matrix_t *B, matrix_t *C) { int i, j, k; for (k = 0; k < A->cols; k++) { for (j = 0; j < B->cols; j++) { for (i = 0; i < A->rows; i++) { element(C,i,j) += element(A,i,k) * element(B,k,j); } } } return 0; } /** * Multiply matrix A*B, store result in C. * Version 3: * - Increment pointers instead of recomputing addresses * - Stash a reused value in a register * - Unroll the inner loop */ int matrix_multiply_run_3(matrix_t *A, matrix_t *B, matrix_t *C) { int i, j, k; double *aptr, *cptr; register double b; for (k = 0; k < A->cols; k++) { for (j = 0; j < B->cols; j++) { b = element(B,k,j); cptr = &element(C,0,j); aptr = &element(A,0,k); for (i = 0; i <= A->rows-8; i += 8) { *cptr++ += *aptr++ * b; *cptr++ += *aptr++ * b; *cptr++ += *aptr++ * b; *cptr++ += *aptr++ * b; *cptr++ += *aptr++ * b; *cptr++ += *aptr++ * b; *cptr++ += *aptr++ * b; *cptr++ += *aptr++ * b; } for (; i < A->rows; i++) { *cptr++ += *aptr++ * b; } } } return 0; } /** * Multiply matrix A*B, store result in C. * Version 4: Blocked recursive. */ int matrix_multiply_run_4(matrix_t *A, matrix_t *B, matrix_t *C, int n_blocks, int min_block) { int ii, jj, kk, b, max_dim; matrix_t AA, BB, CC; max_dim = A->rows; if(max_dim < A->cols) max_dim = A->cols; if(max_dim < B->cols) max_dim = B->cols; if(max_dim <= min_block){ return matrix_multiply_run_3(A, B, C); }else{ b = (max_dim + n_blocks - 1) / n_blocks; for (kk = 0; kk < A->cols; kk += b) { for (jj = 0; jj < B->cols; jj += b) { make_submatrix(&BB, B, kk, jj, min(b, B->rows - kk), min(b, B->cols - jj)); for (ii = 0; ii < A->rows; ii += b) { make_submatrix(&AA, A, ii, kk, min(b, A->rows - ii), min(b, A->cols - kk)); make_submatrix(&CC, C, ii, jj, min(b, C->rows - ii), min(b, C->cols - jj)); matrix_multiply_run_4(&AA, &BB, &CC, n_blocks, min_block); } } } return 0; } } /** * Multiply matrix A*B, store result in C. * Version 5: Call the built in BLAS routine. */ int matrix_multiply_run_5(matrix_t *A, matrix_t *B, matrix_t *C) { cblas_dgemm( CblasColMajor, // const enum CBLAS_ORDER Order, CblasNoTrans, // const enum CBLAS_TRANSPOSE TransA, CblasNoTrans, // const enum CBLAS_TRANSPOSE TransB, A->rows, // const int M, C->cols, // const int N, A->cols, // const int K, 1.0, // const double alpha, A->values, // const double *A, A->colstride, // const int lda, B->values, // const double *B, B->colstride, // const int ldb, 1.0, // const double beta, C->values, // double *C, C->colstride // const int ldc ); return 0; }