/* Copyright (C) 2010 The Trustees of Indiana University. */ /* */ /* Use, modification and distribution is subject to the Boost Software */ /* License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at */ /* http://www.boost.org/LICENSE_1_0.txt) */ /* */ /* Authors: Jeremiah Willcock */ /* Andrew Lumsdaine */ #ifndef __STDC_CONSTANT_MACROS #define __STDC_CONSTANT_MACROS #endif #include "splittable_mrg.h" #include "graph_generator.h" #include "permutation_gen.h" #include "utils.h" #include #include #include #include #include #ifdef __MTA__ #include #endif #ifdef GRAPH_GENERATOR_MPI #include #endif #ifdef GRAPH_GENERATOR_OMP #include #endif typedef struct slot_data { int64_t index, value; } slot_data; /* This code defines a simple closed-indexing hash table. It is used to speed * up the rand_sort algorithm given below. Elements with -1 as index are * unused; others are used. */ #ifdef __MTA__ #pragma mta inline #endif static inline void hashtable_insert(slot_data* ht, int64_t ht_size, int64_t index, int64_t value, int64_t hashval) { int64_t i; for (i = hashval; i < ht_size; ++i) { if (int64_t_cas(&ht[i].index, (int64_t)(-1), index)) { ht[i].value = value; return; } } for (i = 0; i < hashval; ++i) { if (int64_t_cas(&ht[i].index, (int64_t)(-1), index)) { ht[i].value = value; return; } } assert (!"Should not happen: overflow in hash table"); } #ifdef __MTA__ #pragma mta inline #endif static inline int hashtable_count_key(const slot_data* ht, int64_t ht_size, int64_t index, int64_t hashval) { int c = 0; int64_t i; for (i = hashval; i < ht_size && ht[i].index != (int64_t)(-1); ++i) { if (ht[i].index == index) ++c; } if (i == ht_size) { for (i = 0; i < hashval && ht[i].index != (int64_t)(-1); ++i) { if (ht[i].index == index) ++c; } } return c; } /* Return all values with the given index value into result array; return value * of function is element count. */ #ifdef __MTA__ #pragma mta inline #endif static inline int hashtable_get_values(const slot_data* ht, int64_t ht_size, int64_t index, int64_t hashval, int64_t* result) { int x = 0; int64_t i; for (i = hashval; i < ht_size && ht[i].index != (int64_t)(-1); ++i) { if (ht[i].index == index) { result[x++] = ht[i].value; } } if (i == ht_size) { for (i = 0; i < hashval && ht[i].index != (int64_t)(-1); ++i) { if (ht[i].index == index) { result[x++] = ht[i].value; } } } return x; } #ifdef __MTA__ #pragma mta inline #endif static inline void selection_sort(int64_t* a, int64_t n) { int64_t i, j; if (n <= 1) return; for (i = 0; i + 1 < n; ++i) { int64_t minpos = i; for (j = i + 1; j < n; ++j) { if (a[j] < a[minpos]) minpos = j; } if (minpos != i) { int64_t t = a[minpos]; a[minpos] = a[i]; a[i] = t; } } } /* Fisher-Yates shuffle */ #ifdef __MTA__ #pragma mta inline #endif static inline void randomly_permute(int64_t* a, int64_t n, mrg_state* st) { int64_t i, j; if (n <= 1) return; for (i = n - 1; i > 0; --i) { j = random_up_to(st, i + 1); if (i != j) { int64_t t = a[i]; a[i] = a[j]; a[j] = t; } } } /* Exclusive prefix sum on ints; returns sum of overall input array */ static inline int int_prefix_sum(int* out, const int* in, size_t n) { size_t i; if (n == 0) return 0; out[0] = 0; for (i = 1; i < n; ++i) out[i] = out[i - 1] + in[i - 1]; return out[n - 1] + in[n - 1]; } /* A variant of the rand_sort algorithm from Cong and Bader ("An Empirical * Analysis of Parallel Random Permutation Algorithms on SMPs", Georgia Tech TR * GT-CSE-06-06.pdf, * ). * Sorting here is done using a hash table to effectively act as a bucket sort. * The rand_sort algorithm was chosen instead of the other algorithms in order * to get reproducibility across architectures and processor counts. That is * also the reason for the extra sort immediately before scrambling all * elements with the same key, as well as the expensive PRNG operations. */ /* This version is for sequential machines, OpenMP, and the XMT. */ void rand_sort_shared(mrg_state* st, int64_t n, int64_t* result /* Array of size n */) { int64_t hash_table_size = 2 * n + 128; /* Must be >n, preferably larger for performance */ slot_data* ht = (slot_data*)xmalloc(hash_table_size * sizeof(slot_data)); int64_t i; #ifdef __MTA__ #pragma mta block schedule #endif #ifdef GRAPH_GENERATOR_OMP #pragma omp parallel for #endif for (i = 0; i < hash_table_size; ++i) ht[i].index = (int64_t)(-1); /* Unused */ #ifdef __MTA__ #pragma mta assert parallel #pragma mta block schedule #endif #ifdef GRAPH_GENERATOR_OMP #pragma omp parallel for #endif /* Put elements into the hash table with random keys. */ for (i = 0; i < n; ++i) { mrg_state new_st = *st; mrg_skip(&new_st, 1, i, 0); int64_t index = (int64_t)random_up_to(&new_st, hash_table_size); hashtable_insert(ht, hash_table_size, index, i, index); } /* Count elements with each key in order to sort them by key. */ int64_t* bucket_counts = (int64_t*)xcalloc(hash_table_size, sizeof(int64_t)); /* Uses zero-initialization */ #ifdef __MTA__ #pragma mta assert parallel #pragma mta block schedule #endif #ifdef GRAPH_GENERATOR_OMP #pragma omp parallel for #endif for (i = 0; i < hash_table_size; ++i) { /* Count all elements with same index. */ bucket_counts[i] = hashtable_count_key(ht, hash_table_size, i, i); } /* bucket_counts replaced by its prefix sum (start of each bucket in output array) */ int64_t* bucket_starts_in_result = bucket_counts; int64_t running_sum = 0; #ifdef __MTA__ #pragma mta block schedule #endif /* FIXME: parallelize this on OpenMP */ for (i = 0; i < hash_table_size; ++i) { int64_t old_running_sum = running_sum; running_sum += bucket_counts[i]; bucket_counts[i] = old_running_sum; } assert (running_sum == n); bucket_counts = NULL; #ifdef __MTA__ #pragma mta assert parallel #pragma mta block schedule #endif #ifdef GRAPH_GENERATOR_OMP #pragma omp parallel for #endif for (i = 0; i < hash_table_size; ++i) { int64_t result_start_idx = bucket_starts_in_result[i]; int64_t* temp = result + result_start_idx; /* Gather up all elements with same key. */ int64_t bi = (int64_t)hashtable_get_values(ht, hash_table_size, i, i, temp); if (bi > 1) { /* Selection sort them (for consistency in parallel implementations). */ selection_sort(temp, bi); /* Randomly permute them. */ mrg_state new_st = *st; mrg_skip(&new_st, 1, i, 100); randomly_permute(temp, bi, &new_st); } } free(ht); ht = NULL; free(bucket_starts_in_result); bucket_starts_in_result = NULL; } #ifdef GRAPH_GENERATOR_MPI void rand_sort_mpi(MPI_Comm comm, mrg_state* st, int64_t n, int64_t* result_size_ptr, int64_t** result_ptr /* Allocated using xmalloc() by rand_sort_mpi */) { int size, rank; MPI_Comm_size(comm, &size); MPI_Comm_rank(comm, &rank); /* Make MPI data type for slot_data. */ MPI_Datatype slot_data_type; { int blocklens[] = {1, 1}; MPI_Aint temp_base, indices[2]; slot_data temp; MPI_Get_address(&temp, &temp_base); MPI_Get_address(&temp.index, &indices[0]); MPI_Get_address(&temp.value, &indices[1]); indices[0] -= temp_base; indices[1] -= temp_base; MPI_Datatype old_types[] = {INT64_T_MPI_TYPE, INT64_T_MPI_TYPE}; MPI_Type_struct(2, blocklens, indices, old_types, &slot_data_type); MPI_Type_commit(&slot_data_type); } int64_t total_hash_table_size = 2 * n + 128; /* Must be >n, preferably larger for performance */ /* Hash table is distributed by blocks: first (total_hash_table_size % size) * are of size (total_hash_table_size / size + 1), rest are of size * (total_hash_table_size / size). This distribution is necessary so that * the permutation can easily be assembled at the end of the function. */ int64_t ht_base_block_size = total_hash_table_size / size; int ht_block_size_cutoff_rank = total_hash_table_size % size; int64_t ht_block_size_cutoff_index = ht_block_size_cutoff_rank * (ht_base_block_size + 1); int64_t ht_my_size = ht_base_block_size + (rank < ht_block_size_cutoff_rank); int64_t ht_my_start = (rank < ht_block_size_cutoff_rank) ? rank * (ht_base_block_size + 1) : ht_block_size_cutoff_index + (rank - ht_block_size_cutoff_rank) * ht_base_block_size; int64_t ht_my_end = ht_my_start + ht_my_size; #define HT_OWNER(e) \ (((e) < ht_block_size_cutoff_index) ? \ (e) / (ht_base_block_size + 1) : \ ht_block_size_cutoff_rank + ((e) - ht_block_size_cutoff_index) / ht_base_block_size) #define HT_LOCAL(e) ((e) - ht_my_start) /* Input elements to scramble are distributed cyclically for simplicity; * their distribution does not matter. */ int64_t elt_my_size = (n / size) + (rank < n % size); int64_t i; /* Cache the key-value pairs to avoid PRNG skip operations. Count the number * of pairs going to each destination processor. */ slot_data* kv_pairs = (slot_data*)xmalloc(elt_my_size * sizeof(slot_data)); int* outcounts = (int*)xcalloc(size, sizeof(int)); /* Relies on zero-init */ for (i = 0; i < elt_my_size; ++i) { mrg_state new_st = *st; mrg_skip(&new_st, 1, i * size + rank, 0); int64_t index = (int64_t)random_up_to(&new_st, total_hash_table_size); int64_t owner = HT_OWNER(index); assert (owner >= 0 && owner < size); ++outcounts[owner]; kv_pairs[i].index = index; kv_pairs[i].value = i * size + rank; } int* outdispls = (int*)xmalloc(size * sizeof(int)); int total_outcount = int_prefix_sum(outdispls, outcounts, size); slot_data* outdata = (slot_data*)xmalloc(total_outcount * sizeof(slot_data)); int* outoffsets = (int*)xmalloc(size * sizeof(int)); memcpy(outoffsets, outdispls, size * sizeof(int)); /* Put the key-value pairs into the output buffer, sorted by destination, to * get ready for MPI_Alltoallv. */ for (i = 0; i < elt_my_size; ++i) { int64_t index = kv_pairs[i].index; int64_t owner = HT_OWNER(index); outdata[outoffsets[owner]] = kv_pairs[i]; ++outoffsets[owner]; } free(kv_pairs); kv_pairs = NULL; for (i = 0; i < size; ++i) { assert (outoffsets[i] == outdispls[i] + outcounts[i]); } free(outoffsets); outoffsets = NULL; int* incounts = (int*)xmalloc(size * sizeof(int)); /* Send data counts. */ MPI_Alltoall(outcounts, 1, MPI_INT, incounts, 1, MPI_INT, comm); int* indispls = (int*)xmalloc(size * sizeof(int)); int total_incount = int_prefix_sum(indispls, incounts, size); slot_data* indata = (slot_data*)xmalloc(total_incount * sizeof(slot_data)); /* Send data to put into hash table. */ MPI_Alltoallv(outdata, outcounts, outdispls, slot_data_type, indata, incounts, indispls, slot_data_type, comm); free(outdata); outdata = NULL; free(outcounts); outcounts = NULL; free(outdispls); outdispls = NULL; free(incounts); incounts = NULL; free(indispls); indispls = NULL; MPI_Type_free(&slot_data_type); /* Create the local part of the hash table. */ slot_data* ht = (slot_data*)xmalloc(ht_my_size * sizeof(slot_data)); for (i = ht_my_start; i < ht_my_end; ++i) { ht[HT_LOCAL(i)].index = (int64_t)(-1); /* Unused */ } for (i = 0; i < total_incount; ++i) { int64_t index = indata[i].index, value = indata[i].value; assert (HT_OWNER(index) == rank); hashtable_insert(ht, ht_my_size, index, value, HT_LOCAL(index)); } free(indata); indata = NULL; /* Make the local part of the result. Most of the rest of this code is * similar to the shared-memory/XMT version above. */ int64_t* result = (int64_t*)xmalloc(total_incount * sizeof(int64_t)); *result_ptr = result; *result_size_ptr = total_incount; int64_t* bucket_counts = (int64_t*)xmalloc(ht_my_size * sizeof(int64_t)); for (i = ht_my_start; i < ht_my_end; ++i) { /* Count all elements with same index. */ bucket_counts[HT_LOCAL(i)] = hashtable_count_key(ht, ht_my_size, i, HT_LOCAL(i)); } /* bucket_counts replaced by its prefix sum (start of each bucket in output array) */ int64_t* bucket_starts_in_result = bucket_counts; int64_t running_sum = 0; for (i = 0; i < ht_my_size; ++i) { int64_t old_running_sum = running_sum; running_sum += bucket_counts[i]; bucket_counts[i] = old_running_sum; } assert (running_sum == total_incount); bucket_counts = NULL; for (i = ht_my_start; i < ht_my_end; ++i) { int64_t result_start_idx = bucket_starts_in_result[HT_LOCAL(i)]; int64_t* temp = result + result_start_idx; /* Gather up all elements with same key. */ int64_t bi = (int64_t)hashtable_get_values(ht, ht_my_size, i, HT_LOCAL(i), temp); if (bi > 1) { /* Selection sort them (for consistency in parallel implementations). */ selection_sort(temp, bi); /* Randomly permute them. */ mrg_state new_st = *st; mrg_skip(&new_st, 1, i, 100); randomly_permute(temp, bi, &new_st); } } free(ht); ht = NULL; free(bucket_starts_in_result); bucket_starts_in_result = NULL; } #undef HT_OWNER #undef HT_LOCAL #endif /* GRAPH_GENERATOR_MPI */ /* Code below this is used for testing the permutation generators. */ #if 0 int main(int argc, char** argv) { MPI_Init(&argc, &argv); const int64_t n = 200000; int64_t* result = NULL; int64_t result_size; mrg_state st; uint_fast32_t seed[5] = {1, 2, 3, 4, 5}; mrg_seed(&st, seed); MPI_Barrier(MPI_COMM_WORLD); double start = MPI_Wtime(); rand_sort_mpi(MPI_COMM_WORLD, &st, n, &result_size, &result); MPI_Barrier(MPI_COMM_WORLD); double time = MPI_Wtime() - start; #if 0 int64_t i; printf("My count = %" PRId64 "\n", result_size); for (i = 0; i < result_size; ++i) printf("%" PRId64 "\n", result[i]); #endif int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (rank == 0) { printf("Shuffle of %" PRId64 " element(s) took %f second(s).\n", n, time); } free(result); result = NULL; MPI_Finalize(); return 0; } #endif #if 0 int main(int argc, char** argv) { const int64_t n = 5000000; int64_t* result = (int64_t*)xmalloc(n * sizeof(int64_t)); mrg_state st; uint_fast32_t seed[5] = {1, 2, 3, 4, 5}; mrg_seed(&st, seed); unsigned long time; #pragma mta fence time = mta_get_clock(0); rand_sort_shared(&st, n, result); #pragma mta fence time = mta_get_clock(time); #if 0 int64_t i; for (i = 0; i < n; ++i) printf("%" PRId64 "\n", result[i]); #endif printf("Shuffle of %" PRId64 " element(s) took %f second(s).\n", n, time * mta_clock_period()); free(result); result = NULL; return 0; } #endif #if 0 int main(int argc, char** argv) { const int64_t n = 5000000; int64_t* result = (int64_t*)xmalloc(n * sizeof(int64_t)); mrg_state st; uint_fast32_t seed[5] = {1, 2, 3, 4, 5}; mrg_seed(&st, seed); double time; time = omp_get_wtime(); rand_sort_shared(&st, n, result); time = omp_get_wtime() - time; #if 0 int64_t i; for (i = 0; i < n; ++i) printf("%" PRId64 "\n", result[i]); #endif printf("Shuffle of %" PRId64 " element(s) took %f second(s).\n", n, time); free(result); result = NULL; return 0; } #endif