__device__ float normpdf_cuda(float x, float mu, float sigma) { return expf(-(x - mu) * (x - mu) / (2.0f * sigma * sigma)); } __device__ float normpdf_cuda(float x, float sigma) { return expf(-x * x / (2.0f * sigma * sigma)); } __device__ float smooth_step(float x, float a, float b) { if (x < a) return 1.0f; if (x > b) return 0.0f; return 1.0f - (x - a) / (b - a); } __global__ void first_pass_kernel(PerRayData_store *store_buffer, PerRayData_y *y_buffer) { int i = blockIdx.x; int j = blockIdx.y; __shared__ float3 data[Y][SQRT_SAMPLES]; __shared__ float weight[Y][SQRT_SAMPLES]; int launch_index = j * SAMPLE_PIXEL + i; int y1c_bin = threadIdx.x; int t = threadIdx.y; data[y1c_bin][t] = make_float3(0.0f); weight[y1c_bin][t] = 0.0f; float y1c = ((y1c_bin + 0.5f) / Y - 0.5f) * (aperture_radius * 2.0f + sigma_lens * 4.0f); for (int k = 0; k < SQRT_SAMPLES; k++) { float y1 = ((k + store_buffer[launch_index].dither1) / SQRT_SAMPLES - 0.5f) * aperture_radius * 2.0f; float weight_x = normpdf_cuda(y1, y1c, sigma_lens); data[y1c_bin][t] += store_buffer[launch_index].data[k][t] * weight_x; weight[y1c_bin][t] += weight_x; } y_buffer[launch_index].data[y1c_bin][t] = data[y1c_bin][t]; y_buffer[launch_index].weight[y1c_bin][t] = weight[y1c_bin][t]; } __global__ void second_pass_kernel(PerRayData_store *store_buffer, PerRayData_y *y_buffer, PerRayData_p *p_buffer) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; int launch_index = j * SAMPLE_PIXEL + i; if (threadIdx.z < C) { p_buffer[launch_index].visibility[threadIdx.z] = make_float3(0.0f); p_buffer[launch_index].weight[threadIdx.z] = 0.0f; } __syncthreads(); int p = threadIdx.z - FILTER_PIXELS_THRESHOLD; if (i + p < 0 || i + p >= SAMPLE_PIXEL) return; int sample_index = j * SAMPLE_PIXEL + i + p; float sigma_x = store_buffer[launch_index].sigma_x; if (fabsf(p) > 3.0f * sigma_x) return; float primal_shear = store_buffer[launch_index].primal_shear; float sigma_x_sample = store_buffer[sample_index].sigma_x; float dither = store_buffer[sample_index].dither2; float weight = normpdf_cuda(p, sigma_x); if (store_buffer[launch_index].depth / store_buffer[sample_index].depth > 1.0f) weight *= smooth_step(sigma_x / sigma_x_sample, 1.0f, 1.8f); float y1c = primal_shear * p; int y1c_bin = (int) ((y1c / (aperture_radius * 2.0f + sigma_lens * 4.0f) + 0.5f) * Y); if (y1c_bin < 0 || y1c_bin >= Y) return; for (int yy2 = 0; yy2 < SQRT_SAMPLES; yy2++) { float y2 = ((yy2 + dither) / SQRT_SAMPLES - 0.5f) * aperture_radius * 2.0f; int y2_bin = (int) ((y2 / (aperture_radius * 2.0f + sigma_lens * 4.0f) + 0.5f) * C); atomicAdd(&p_buffer[launch_index].visibility[y2_bin].x, y_buffer[sample_index].data[y1c_bin][yy2].x * weight); atomicAdd(&p_buffer[launch_index].visibility[y2_bin].y, y_buffer[sample_index].data[y1c_bin][yy2].y * weight); atomicAdd(&p_buffer[launch_index].visibility[y2_bin].z, y_buffer[sample_index].data[y1c_bin][yy2].z * weight); atomicAdd(&p_buffer[launch_index].weight[y2_bin], y_buffer[sample_index].weight[y1c_bin][yy2] * weight); } } __global__ void third_pass_kernel(PerRayData_p *p_buffer) { __shared__ float3 total_visibility[2][2]; __shared__ float total_weight[2][2]; int launch_index = (blockIdx.y * 2 + threadIdx.y) * SAMPLE_PIXEL + (blockIdx.x * 2 + threadIdx.x); int k = (int)threadIdx.z; float y2c = ((k + 0.5f) / C - 0.5f) * (aperture_radius * 2.0f + sigma_lens * 4.0f); total_visibility[threadIdx.x][threadIdx.y] = make_float3(0.0f); total_weight[threadIdx.x][threadIdx.y] = 0.0f; for (int t = 0; t < C; t++) { float y2 = ((t + 0.5f) / C - 0.5f) * (aperture_radius * 2.0f + sigma_lens * 4.0f); total_visibility[threadIdx.x][threadIdx.y] += p_buffer[launch_index].visibility[t] * normpdf_cuda(y2, y2c, sigma_lens); total_weight[threadIdx.x][threadIdx.y] += p_buffer[launch_index].weight[t] * normpdf_cuda(y2, y2c, sigma_lens); } p_buffer[launch_index].visibility_conv[k] = total_visibility[threadIdx.x][threadIdx.y]; p_buffer[launch_index].weight_conv[k] = total_weight[threadIdx.x][threadIdx.y]; } __global__ void fourth_pass_kernel(PerRayData_store *store_buffer, PerRayData_p *p_buffer) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; int launch_index = j * SAMPLE_PIXEL + i; if (threadIdx.z == 0) store_buffer[launch_index].temp = make_float4(0.0f); __syncthreads(); int q = threadIdx.z - FILTER_PIXELS_THRESHOLD; if (j + q < 0 || j + q >= SAMPLE_PIXEL) return; int sample_index = (j + q) * SAMPLE_PIXEL + i; float sigma_x = store_buffer[launch_index].sigma_x; float sigma_x_sample = store_buffer[sample_index].sigma_x; float primal_shear = store_buffer[launch_index].primal_shear; if (fabsf(q) > 3.0f * sigma_x) return; float dist = q; float weight = normpdf_cuda(dist, sigma_x); if (store_buffer[launch_index].depth / store_buffer[sample_index].depth > 1.0f) weight *= smooth_step(sigma_x / sigma_x_sample, 1.0f, 1.8f); float y2c = primal_shear * dist; int y2c_bin = (int) ((y2c / (aperture_radius * 2.0f + sigma_lens * 4.0f) + 0.5f) * C); if (y2c_bin < 0 || y2c_bin >= C) return; atomicAdd(&store_buffer[launch_index].temp.x, p_buffer[sample_index].visibility_conv[y2c_bin].x * weight); atomicAdd(&store_buffer[launch_index].temp.y, p_buffer[sample_index].visibility_conv[y2c_bin].y * weight); atomicAdd(&store_buffer[launch_index].temp.z, p_buffer[sample_index].visibility_conv[y2c_bin].z * weight); atomicAdd(&store_buffer[launch_index].temp.w, p_buffer[sample_index].weight_conv[y2c_bin] * weight); } extern "C" __host__ void call_cuda_first_pass(void *store_buffer_gpu, void *y_buffer_gpu) { dim3 blockSize(SAMPLE_PIXEL, SAMPLE_PIXEL); dim3 threadSize(Y, SQRT_SAMPLES); first_pass_kernel<<>> ((PerRayData_store*) store_buffer_gpu, (PerRayData_y*) y_buffer_gpu); cudaDeviceSynchronize(); } extern "C" __host__ void call_cuda_second_pass(void *store_buffer_gpu, void *y_buffer_gpu, void *p_buffer_gpu) { dim3 blockSize(SAMPLE_PIXEL / 2, SAMPLE_PIXEL / 2); dim3 threadSize(2, 2, 2 * FILTER_PIXELS_THRESHOLD); second_pass_kernel<<>> ((PerRayData_store*) store_buffer_gpu, (PerRayData_y*) y_buffer_gpu, (PerRayData_p*) p_buffer_gpu); cudaDeviceSynchronize(); } extern "C" __host__ void call_cuda_third_pass(void *p_buffer_gpu) { dim3 blockSize(SAMPLE_PIXEL / 2, SAMPLE_PIXEL / 2); dim3 threadSize(2, 2, C); third_pass_kernel<<>> ((PerRayData_p*) p_buffer_gpu); cudaDeviceSynchronize(); } extern "C" __host__ void call_cuda_fourth_pass(void *store_buffer_gpu, void *p_buffer_gpu) { dim3 blockSize(SAMPLE_PIXEL / 2, SAMPLE_PIXEL / 2); dim3 threadSize(2, 2, 2 * FILTER_PIXELS_THRESHOLD); fourth_pass_kernel<<>> ((PerRayData_store*) store_buffer_gpu, (PerRayData_p*) p_buffer_gpu); cudaDeviceSynchronize(); }