__device__ float normpdf_cuda(float x, float mu, float sigma) {
  return expf(-(x - mu) * (x - mu) / (2.0f * sigma * sigma));
}

__device__ float normpdf_cuda(float x, float sigma) {
  return expf(-x * x / (2.0f * sigma * sigma));
}

__device__ float smooth_step(float x, float a, float b) {
	if (x < a)
		return 1.0f;
	if (x > b)
		return 0.0f;
	return 1.0f - (x - a) / (b - a);
}

__global__ void first_pass_kernel(PerRayData_store *store_buffer, PerRayData_y *y_buffer) {
	int i = blockIdx.x;
	int j = blockIdx.y;
	__shared__ float3 data[Y][SQRT_SAMPLES];
	__shared__ float weight[Y][SQRT_SAMPLES];

	int launch_index = j * SAMPLE_PIXEL + i;

	int y1c_bin = threadIdx.x;
	int t = threadIdx.y;

	data[y1c_bin][t] = make_float3(0.0f);
	weight[y1c_bin][t] = 0.0f;

	float y1c = ((y1c_bin + 0.5f) / Y - 0.5f) * (aperture_radius * 2.0f + sigma_lens * 4.0f);
	for (int k = 0; k < SQRT_SAMPLES; k++) {
		float y1 = ((k + store_buffer[launch_index].dither1) / SQRT_SAMPLES - 0.5f) * aperture_radius * 2.0f;
		float weight_x = normpdf_cuda(y1, y1c, sigma_lens);
		data[y1c_bin][t] += store_buffer[launch_index].data[k][t] * weight_x;
		weight[y1c_bin][t] += weight_x;
	}
	y_buffer[launch_index].data[y1c_bin][t] = data[y1c_bin][t];
	y_buffer[launch_index].weight[y1c_bin][t] = weight[y1c_bin][t];
}

__global__ void second_pass_kernel(PerRayData_store *store_buffer, PerRayData_y *y_buffer, PerRayData_p *p_buffer) {
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	int j = blockIdx.y * blockDim.y + threadIdx.y;

	int launch_index = j * SAMPLE_PIXEL + i;

	if (threadIdx.z < C) {
		p_buffer[launch_index].visibility[threadIdx.z] = make_float3(0.0f);
		p_buffer[launch_index].weight[threadIdx.z] = 0.0f;
	}

	__syncthreads();

	int p = threadIdx.z - FILTER_PIXELS_THRESHOLD;
	if (i + p < 0 || i + p >= SAMPLE_PIXEL) return;
	int sample_index = j * SAMPLE_PIXEL + i + p;

	float sigma_x = store_buffer[launch_index].sigma_x;
    if (fabsf(p) > 3.0f * sigma_x) return;

	float primal_shear = store_buffer[launch_index].primal_shear;
	float sigma_x_sample = store_buffer[sample_index].sigma_x;
	float dither = store_buffer[sample_index].dither2;

	float weight = normpdf_cuda(p, sigma_x);
    if (store_buffer[launch_index].depth / store_buffer[sample_index].depth > 1.0f)
        weight *= smooth_step(sigma_x / sigma_x_sample, 1.0f, 1.8f);

	float y1c = primal_shear * p;
	int y1c_bin = (int) ((y1c / (aperture_radius * 2.0f + sigma_lens * 4.0f) + 0.5f) * Y);
	if (y1c_bin < 0 || y1c_bin >= Y) return;

	for (int yy2 = 0; yy2 < SQRT_SAMPLES; yy2++) {
		float y2 = ((yy2 + dither) / SQRT_SAMPLES - 0.5f) * aperture_radius * 2.0f;
		int y2_bin = (int) ((y2 / (aperture_radius * 2.0f + sigma_lens * 4.0f) + 0.5f) * C);

		atomicAdd(&p_buffer[launch_index].visibility[y2_bin].x, y_buffer[sample_index].data[y1c_bin][yy2].x * weight);
		atomicAdd(&p_buffer[launch_index].visibility[y2_bin].y, y_buffer[sample_index].data[y1c_bin][yy2].y * weight);
		atomicAdd(&p_buffer[launch_index].visibility[y2_bin].z, y_buffer[sample_index].data[y1c_bin][yy2].z * weight);
		atomicAdd(&p_buffer[launch_index].weight[y2_bin], y_buffer[sample_index].weight[y1c_bin][yy2] * weight);
	}
}

__global__ void third_pass_kernel(PerRayData_p *p_buffer) {
	__shared__ float3 total_visibility[2][2];
	__shared__ float total_weight[2][2];

	int launch_index = (blockIdx.y * 2 + threadIdx.y) * SAMPLE_PIXEL + (blockIdx.x * 2 + threadIdx.x);
	int k = (int)threadIdx.z;

	float y2c = ((k + 0.5f) / C - 0.5f) * (aperture_radius * 2.0f + sigma_lens * 4.0f);

	total_visibility[threadIdx.x][threadIdx.y] = make_float3(0.0f);
	total_weight[threadIdx.x][threadIdx.y] = 0.0f;

	for (int t = 0; t < C; t++) {
		float y2 = ((t + 0.5f) / C - 0.5f) * (aperture_radius * 2.0f + sigma_lens * 4.0f);
		total_visibility[threadIdx.x][threadIdx.y] += p_buffer[launch_index].visibility[t] * normpdf_cuda(y2, y2c, sigma_lens);
		total_weight[threadIdx.x][threadIdx.y] += p_buffer[launch_index].weight[t] * normpdf_cuda(y2, y2c, sigma_lens);
	}
	p_buffer[launch_index].visibility_conv[k] = total_visibility[threadIdx.x][threadIdx.y];
	p_buffer[launch_index].weight_conv[k] = total_weight[threadIdx.x][threadIdx.y];
}

__global__ void fourth_pass_kernel(PerRayData_store *store_buffer, PerRayData_p *p_buffer) {
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	int j = blockIdx.y * blockDim.y + threadIdx.y;

	int launch_index = j * SAMPLE_PIXEL + i;

	if (threadIdx.z == 0)
		store_buffer[launch_index].temp = make_float4(0.0f);

	__syncthreads();

	int q = threadIdx.z - FILTER_PIXELS_THRESHOLD;
	if (j + q < 0 || j + q >= SAMPLE_PIXEL) return;
	int sample_index = (j + q) * SAMPLE_PIXEL + i;

	float sigma_x = store_buffer[launch_index].sigma_x;
	float sigma_x_sample = store_buffer[sample_index].sigma_x;
	float primal_shear = store_buffer[launch_index].primal_shear;
    if (fabsf(q) > 3.0f * sigma_x) return;

	float dist = q;
	float weight = normpdf_cuda(dist, sigma_x);
    if (store_buffer[launch_index].depth / store_buffer[sample_index].depth > 1.0f)
        weight *= smooth_step(sigma_x / sigma_x_sample, 1.0f, 1.8f);
	float y2c = primal_shear * dist;
	int y2c_bin = (int) ((y2c / (aperture_radius * 2.0f + sigma_lens * 4.0f) + 0.5f) * C);
	if (y2c_bin < 0 || y2c_bin >= C) return;

	atomicAdd(&store_buffer[launch_index].temp.x, p_buffer[sample_index].visibility_conv[y2c_bin].x * weight);
	atomicAdd(&store_buffer[launch_index].temp.y, p_buffer[sample_index].visibility_conv[y2c_bin].y * weight);
	atomicAdd(&store_buffer[launch_index].temp.z, p_buffer[sample_index].visibility_conv[y2c_bin].z * weight);
	atomicAdd(&store_buffer[launch_index].temp.w, p_buffer[sample_index].weight_conv[y2c_bin] * weight);
}

extern "C" __host__ void call_cuda_first_pass(void *store_buffer_gpu, void *y_buffer_gpu) {
	dim3 blockSize(SAMPLE_PIXEL, SAMPLE_PIXEL);
	dim3 threadSize(Y, SQRT_SAMPLES);

	first_pass_kernel<<<blockSize, threadSize>>> ((PerRayData_store*) store_buffer_gpu, (PerRayData_y*) y_buffer_gpu);

	cudaDeviceSynchronize();
}

extern "C" __host__ void call_cuda_second_pass(void *store_buffer_gpu, void *y_buffer_gpu, void *p_buffer_gpu) {
	dim3 blockSize(SAMPLE_PIXEL / 2, SAMPLE_PIXEL / 2);
	dim3 threadSize(2, 2, 2 * FILTER_PIXELS_THRESHOLD);
	second_pass_kernel<<<blockSize, threadSize>>> ((PerRayData_store*) store_buffer_gpu, (PerRayData_y*) y_buffer_gpu, (PerRayData_p*) p_buffer_gpu);

	cudaDeviceSynchronize();
}

extern "C" __host__ void call_cuda_third_pass(void *p_buffer_gpu) {
	dim3 blockSize(SAMPLE_PIXEL / 2, SAMPLE_PIXEL / 2);
	dim3 threadSize(2, 2, C);

	third_pass_kernel<<<blockSize, threadSize>>> ((PerRayData_p*) p_buffer_gpu);

	cudaDeviceSynchronize();
}

extern "C" __host__ void call_cuda_fourth_pass(void *store_buffer_gpu, void *p_buffer_gpu) {
	dim3 blockSize(SAMPLE_PIXEL / 2, SAMPLE_PIXEL / 2);
	dim3 threadSize(2, 2, 2 * FILTER_PIXELS_THRESHOLD);
	fourth_pass_kernel<<<blockSize, threadSize>>> ((PerRayData_store*) store_buffer_gpu, (PerRayData_p*) p_buffer_gpu);

	cudaDeviceSynchronize();
}