#include"SimpleMOC_header.h"

/* Efficient version of attenuate fluxes which determines the change in angular
 * flux along a particular track across a fine axial region and tallies the 
 * contribution to the scalar flux in the fine axial region. This function
 * assumes a quadratic source, which is calculated on the fly using neighboring
 * source values. 
 * 
 * This version decomposes the work into many for loops for efficient SIMD 
 * instructions and to reduce register pressure. For a more descriptive 
 * (but less efficient) version of the code in terms of the underlying physics, 
 * see alt_attenuate_fluxes which solves the problem in a more naive, 
 * straightforward manner.
 *
 * Parameters:
 *   track     - track being attenuated; z_height and p_weight are read, and
 *               the selected angular flux vector (f_psi or b_psi) is updated
 *   forward   - true selects track->f_psi, false selects track->b_psi
 *   QSR       - source region providing sigT and fine_source; the fine_flux
 *               row of the current fine axial region is incremented (guarded
 *               by QSR->locks[fine_id] when OPENMP is defined)
 *   I_in      - input struct (height, fai, cai, decomp_assemblies_ax and
 *               n_egroups are read)
 *   params_in - parameter struct (expTable is read)
 *   ds        - distance traveled along the track inside the region
 *   mu        - factor applied to the linear source term (mu*mu is applied to
 *               the quadratic term); callers pass the polar angle cosine
 *   az_weight - azimuthal weight, multiplied by track->p_weight
 *   A         - caller-provided scratch arrays, each indexed over
 *               [0, n_egroups); all of them are overwritten
 *
 * NOTE(review): the fit reads fine_source rows center-1 .. center+1, so this
 * presumably requires I.fai >= 3 and a non-negative z_height -- confirm. */
void attenuate_fluxes( Track * track, bool forward, Source * QSR, Input * I_in,
		Params * params_in, float ds, float mu, float az_weight, 
		AttenuateVars * A ) 
{
	// work on local copies of the input and parameter structs
	Input I = *I_in;
	Params params = *params_in;

	// unload attenuate vars
	float * restrict q0 = A->q0;
	float *  restrict q1 = A->q1;
	float *  restrict q2 = A->q2;
	float *  restrict sigT = A->sigT;
	float *  restrict tau = A->tau;
	float *  restrict sigT2 = A->sigT2;
	float *  restrict expVal = A->expVal;
	float *  restrict reuse = A->reuse;
	float *  restrict flux_integral = A->flux_integral;
	float *  restrict tally = A->tally;
	float *  restrict t1 = A->t1;
	float *  restrict t2 = A->t2;
	float *  restrict t3 = A->t3;
	float *  restrict t4 = A->t4;

	// compute fine axial interval spacing
	float dz = I.height / (I.fai * I.decomp_assemblies_ax * I.cai);

	// compute z height in cell (offset from the center of the interval)
	float zin = track->z_height - dz * 
		( (int)( track->z_height / dz ) + 0.5f );

	// compute fine axial region ID
	int fine_id = (int) ( track->z_height / dz ) % I.fai;

	// compute weight (azimuthal * polar)
	// NOTE: real app would also have volume weight component
	float weight = track->p_weight * az_weight;
	float mu2 = mu * mu;

	// load fine source region flux vector
	float * FSR_flux = QSR -> fine_flux[fine_id];

	/* Select the center of the three-point stencil used for the quadratic
	 * fit. Interior regions are centered on themselves; the first and last
	 * regions borrow the stencil of their inner neighbor and shift zin by
	 * one interval so that it is measured from that neighbor's center. */
	int center = fine_id;
	if( fine_id == 0 )
	{
		// adjust z height to account for edge
		center = fine_id + 1;
		zin -= dz;
	}
	else if ( fine_id == I.fai - 1 )
	{
		// adjust z height to account for edge
		center = fine_id - 1;
		zin += dz;
	}

	// cycle over energy groups
	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I.n_egroups; g++)
	{
		// load neighboring sources
		float y1 = QSR->fine_source[center-1][g];
		float y2 = QSR->fine_source[center][g];
		float y3 = QSR->fine_source[center+1][g];

		// do quadratic "fitting"
		float c0 = y2;
		float c1 = (y1 - y3) / (2.f*dz);
		float c2 = (y1 - 2.f*y2 + y3) / (2.f*dz*dz);

		// calculate q0, q1, q2
		q0[g] = c0 + c1*zin + c2*zin*zin;
		q1[g] = c1 + 2.f*c2*zin;
		q2[g] = c2;
	}

	// cycle over energy groups
	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I.n_egroups; g++)
	{
		// load total cross section
		sigT[g] = QSR->sigT[g];

		// calculate common values for efficiency
		tau[g] = sigT[g] * ds;
		sigT2[g] = sigT[g] * sigT[g];
	}

	// table lookup of the exponential term for every group
	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I.n_egroups; g++)
		expVal[g] = interpolateTable( params.expTable, tau[g] );  

	// Flux Integral

	/* Re-used Term: (tau*(tau-2) + 2*expVal) / sigT^3. The whole numerator is
	 * divided, matching the corresponding terms in alt_attenuate_fluxes. */
	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I.n_egroups; g++)
	{
		reuse[g] = (tau[g] * (tau[g] - 2.f) + 2.f * expVal[g])
			/ (sigT[g] * sigT2[g]); 
	}

	// load correct angular flux vector
	float * psi;
	if(forward)
		psi = track->f_psi;
	else
		psi = track->b_psi;

	//#pragma vector nontemporal
	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I.n_egroups; g++)
	{
		// add contribution to new source flux
		flux_integral[g] = (q0[g] * tau[g] + (sigT[g] * psi[g] - q0[g])
			* expVal[g]) / sigT2[g] + q1[g] * mu * reuse[g] + q2[g] * mu2 
			* (tau[g] * (tau[g] * (tau[g] - 3.f) + 6.f) - 6.f * expVal[g]) 
			/ (3.f * sigT2[g] * sigT2[g]);
	}

	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I.n_egroups; g++)
	{
		// Prepare tally
		tally[g] = weight * flux_integral[g];
	}

	// the scalar flux row is shared between threads, so guard the update
	#ifdef OPENMP
	omp_set_lock(QSR->locks + fine_id);
	#endif

	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I.n_egroups; g++)
	{
		FSR_flux[g] += tally[g];
	}

	#ifdef OPENMP
	omp_unset_lock(QSR->locks + fine_id);
	#endif

	// Term 1
	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I.n_egroups; g++)
	{
		t1[g] = q0[g] * expVal[g] / sigT[g];  
	}
	// Term 2
	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I.n_egroups; g++)
	{
		t2[g] = q1[g] * mu * (tau[g] - expVal[g]) / sigT2[g]; 
	}
	// Term 3
	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I.n_egroups; g++)
	{
		t3[g] =	q2[g] * mu2 * reuse[g];
	}
	// Term 4
	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I.n_egroups; g++)
	{
		t4[g] = psi[g] * (1.f - expVal[g]);
	}
	// Total psi
	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I.n_egroups; g++)
	{
		psi[g] = t1[g] + t2[g] + t3[g] + t4[g];
	}
}	

/* Single direction transport sweep.
 *
 * Loops over every 2D track, polar angle, 2D segment and z-stacked ray,
 * splitting each segment at fine axial interval boundaries and calling the
 * attenuation kernel selected by I->axial_exp (2: attenuate_fluxes,
 * 0: attenuate_FSR_fluxes) once per piece. The source region handed to the
 * kernel is chosen at random. Track z heights (params->tracks[i][j][k].z_height)
 * are updated as the sweep progresses and reset when a ray leaves the node
 * domain or the last segment is finished.
 *
 * The total number of kernel calls is stored in I->segments_processed.
 * Any other value of I->axial_exp prints an error and calls exit(1). */
void transport_sweep( Params * params, Input * I )
{
	if(I->mype==0) printf("Starting transport sweep ...\n");

	// calculate the height of a node's domain and of each FSR
	double node_delta_z = I->height / I->decomp_assemblies_ax;
	double fine_delta_z = node_delta_z / (I->cai * I->fai);

	/* loop over tracks (implicitly azimuthal angles, tracks in azimuthal 
	 * angles, polar angles, and z stacked rays) */

	//print_Input_struct( I );
	long segments_processed = 0;

	#pragma omp parallel default(none) \
	shared( I, params, node_delta_z, fine_delta_z ) \
	reduction(+ : segments_processed )
	{
		#ifdef OPENMP
		int thread = omp_get_thread_num();
		int nthreads = omp_get_num_threads();
		unsigned int seed = time(NULL) * (thread+1);
		#endif
		//print_Input_struct( I );

		#ifdef PAPI
		int eventset = PAPI_NULL;
		int num_papi_events;
		#pragma omp critical
		{
			counter_init(&eventset, &num_papi_events, I);
		}
		#endif

		/* Scratch space for the attenuation kernels: one allocation carved
		 * into 14 consecutive vectors of n_egroups floats. A.q0 keeps the
		 * base address, which is released once the track loop is done. */
		AttenuateVars A;
		float * ptr = (float * ) malloc( I->n_egroups * 14 * sizeof(float));
		if( ptr == NULL )
		{
			printf("Error: out of memory in transport_sweep\n");
			exit(1);
		}
		A.q0 = ptr;
		ptr += I->n_egroups;
		A.q1 = ptr;
		ptr += I->n_egroups;
		A.q2 = ptr;
		ptr += I->n_egroups;
		A.sigT = ptr;
		ptr += I->n_egroups;
		A.tau = ptr;
		ptr += I->n_egroups;
		A.sigT2 = ptr;
		ptr += I->n_egroups;
		A.expVal = ptr;
		ptr += I->n_egroups;
		A.reuse = ptr;
		ptr += I->n_egroups;
		A.flux_integral = ptr;
		ptr += I->n_egroups;
		A.tally = ptr;
		ptr += I->n_egroups;
		A.t1 = ptr;
		ptr += I->n_egroups;
		A.t2 = ptr;
		ptr += I->n_egroups;
		A.t3 = ptr;
		ptr += I->n_egroups;
		A.t4 = ptr;

		#pragma omp for schedule( dynamic ) 
		for (long i = 0; i < I->ntracks_2D; i++)
		{
			#if TIMING_INFO | 0
				// print progress
				#ifdef OPENMP
				if(I->mype==0 && thread == 0)
				{
					printf("\rAttenuating Tracks... (%.0lf%% completed)",
							(i / ( (double)I->ntracks_2D / (double) nthreads ))
							/ (double) nthreads * 100.0);
				}
				#else
				if( i % 50 == 0)
					if(I->mype==0)
						printf("%s%ld%s%ld\n","2D Tracks Completed = ", i," / ", 
								I->ntracks_2D );
				#endif
			#endif


			// treat positive-z traveling rays first
			bool pos_z_dir = true;
			for( int j = 0; j < I->n_polar_angles; j++)
			{
				// the second half of the polar angles travels in negative z
				if( j == I->n_polar_angles / 2 )
					pos_z_dir = false;
				float p_angle = params->polar_angles[j];
				float mu = cos(p_angle);

				// start with all z stacked rays
				int begin_stacked = 0;
				int end_stacked = I->z_stacked;

				for( int n = 0; n < params->tracks_2D[i].n_segments; n++)
				{
					// calculate distance traveled in cell if segment completed
					float s_full = params->tracks_2D[i].segments[n].length 
						/ sin(p_angle);

					// allocate variable for distance traveled in an FSR
					float ds = 0;

					// loop over remaining z-stacked rays
					for( int k = begin_stacked; k < end_stacked; k++)
					{
						// initialize s to full length
						float s = s_full;

						// select current track
						Track * track = &params->tracks[i][j][k];

						// set flag for completion of segment
						bool seg_complete = false;

						// calculate interval
						int curr_interval;
						if( pos_z_dir)
							curr_interval = get_pos_interval(track->z_height, 
									fine_delta_z);
						else
							curr_interval = get_neg_interval(track->z_height, 
									fine_delta_z);

						while( !seg_complete )
						{
							// flag to reset z position
							bool reset = false;


							/* calculate new height based on s 
							 * (distance traveled in FSR) */
							float z = track->z_height + s * cos(p_angle);

							// check if still in same FSR (fine axial interval)
							int new_interval;
							if( pos_z_dir )
								new_interval = get_pos_interval(z, 
										fine_delta_z);
							else
								new_interval = get_neg_interval(z,
										fine_delta_z);

							if( new_interval == curr_interval )
							{
								seg_complete = true;
								ds = s;
							}

							// otherwise, we need to recalculate distances
							else
							{
								// correct z
								if( pos_z_dir )
								{
									curr_interval++;
									z = fine_delta_z * (float) curr_interval;
								}
								else{
									curr_interval--;
									z = fine_delta_z * (float) curr_interval;
								}

								// calculate distance travelled in FSR (ds)
								ds = (z - track->z_height) / cos(p_angle);

								// update track length remaining
								s -= ds;

								/* check remaining track length to protect
								 * against potential roundoff errors */
								if( s <= 0 )
									seg_complete = true;

								// check if out of bounds or track complete
								if( z <= 0 || z >= node_delta_z )
								{
									// mark segment as completed
									seg_complete = true;

									// remember to no longer treat this track
									if ( pos_z_dir )
										end_stacked--;
									else
										begin_stacked++;

									// reset z height
									reset = true;
								}
							}

							// pick a random FSR (cache miss expected)
							#ifdef OPENMP
							long QSR_id = rand_r(&seed) % 
								I->n_source_regions_per_node;
							#else
							long QSR_id = rand() % 
								I->n_source_regions_per_node;
							#endif

							/* update sources and fluxes from attenuation 
							 * over FSR */
							if( I->axial_exp == 2 )
							{
								attenuate_fluxes( track, true, 
										&params->sources[QSR_id], 
										I, params, ds, mu, 
										params->tracks_2D[i].az_weight, &A );

								segments_processed++;
							}

							else if( I->axial_exp == 0 )
							{
								attenuate_FSR_fluxes( track, true,
										&params->sources[QSR_id],
										I, params, ds, mu,
										params->tracks_2D[i].az_weight, &A );

								segments_processed++;
							}
							else
							{
								printf("Error: invalid axial expansion order");
								printf("\n Please input 0 or 2\n");
								exit(1);
							}

							// update with new z height or reset if finished
							if( n == params->tracks_2D[i].n_segments - 1  
									|| reset)
							{
								if( pos_z_dir)
									track->z_height = I->axial_z_sep * k;
								else
									track->z_height = I->axial_z_sep * (k+1);
							}
							else
								track->z_height = z;

						}
					}
				}
			}
		}

		// release this thread's scratch space (A.q0 is the base address)
		free(A.q0);

		#ifdef OPENMP
		if(thread == 0 && I->mype==0) printf("\n");
		#endif

		#ifdef PAPI
		if( thread == 0 )
		{
			printf("\n");
			border_print();
			center_print("PAPI COUNTER RESULTS", 79);
			border_print();
			printf("Count          \tSmybol      \tDescription\n");
		}
		{
			#pragma omp barrier
		}
		counter_stop(&eventset, num_papi_events, I);
		#endif
	}
	I->segments_processed = segments_processed;

	return;
}


/* Runs one full two-way transport sweep.
 *
 * For every 2D track and polar angle the z-stacked rays are first swept in
 * the forward direction, recording the length and source region of every
 * piece that is attenuated. The recorded pieces are then replayed in reverse
 * order with -mu to perform the backward sweep, after which the track z
 * heights are reset to their starting values.
 *
 * The attenuation kernel is selected by I->axial_exp (2: attenuate_fluxes,
 * 0: attenuate_FSR_fluxes); any other value prints an error and calls
 * exit(1) during the forward sweep. The function returns nothing; the number
 * of kernel calls is stored in I->segments_processed. */
void two_way_transport_sweep( Params * params, Input * I )
{
	if(I->mype==0) printf("Starting transport sweep ...\n");

	// calculate the height of a node's domain and of each FSR
	double node_delta_z = I->height / I->decomp_assemblies_ax;
	int num_intervals = (I->cai * I->fai);
	double fine_delta_z = node_delta_z / num_intervals;

	/* loop over tracks (implicitly azimuthal angles, tracks in azimuthal 
	 * angles, polar angles, and z stacked rays) */
	long segments_processed = 0;

	#pragma omp parallel default(none) \
	shared( I, params, node_delta_z, fine_delta_z, num_intervals ) \
	reduction(+ : segments_processed )
	{
		#ifdef OPENMP
		int thread = omp_get_thread_num();
		int nthreads = omp_get_num_threads();
		unsigned int seed = time(NULL) * (thread+1);
		#endif
		//print_Input_struct( I );

		#ifdef PAPI
		int eventset = PAPI_NULL;
		int num_papi_events;
		#pragma omp critical
		{
			counter_init(&eventset, &num_papi_events, I);
		}
		#endif


		/* Scratch space for the attenuation kernels: one allocation carved
		 * into 14 consecutive vectors of n_egroups floats. A.q0 keeps the
		 * base address, which is released once the track loop is done. */
		AttenuateVars A;
		float * ptr = (float * ) malloc( I->n_egroups * 14 * sizeof(float));
		if( ptr == NULL )
		{
			printf("Error: out of memory in two_way_transport_sweep\n");
			exit(1);
		}
		A.q0 = ptr;
		ptr += I->n_egroups;
		A.q1 = ptr;
		ptr += I->n_egroups;
		A.q2 = ptr;
		ptr += I->n_egroups;
		A.sigT = ptr;
		ptr += I->n_egroups;
		A.tau = ptr;
		ptr += I->n_egroups;
		A.sigT2 = ptr;
		ptr += I->n_egroups;
		A.expVal = ptr;
		ptr += I->n_egroups;
		A.reuse = ptr;
		ptr += I->n_egroups;
		A.flux_integral = ptr;
		ptr += I->n_egroups;
		A.tally = ptr;
		ptr += I->n_egroups;
		A.t1 = ptr;
		ptr += I->n_egroups;
		A.t2 = ptr;
		ptr += I->n_egroups;
		A.t3 = ptr;
		ptr += I->n_egroups;
		A.t4 = ptr;

		#pragma omp for schedule( dynamic ) 
		for (long i = 0; i < I->ntracks_2D; i++)
		{
			// print progress
			#ifdef OPENMP
			if(I->mype==0 && thread == 0)
			{
				printf("\rAttenuating Tracks... (%.0lf%% completed)",
						(i / ( (double)I->ntracks_2D / (double) nthreads ))
						/ (double) nthreads * 100.0);
			}
			#else
			if( i % 50 == 0)
				if(I->mype==0)
					printf("%s%ld%s%ld\n","2D Tracks Completed = ", i," / ", 
							I->ntracks_2D );
			#endif

			// allocate arrays for segment storage FIXME
			double ** seg_dist = malloc( I->z_stacked * sizeof(double *) );
			Source *** seg_src = malloc( I->z_stacked * sizeof(Source**) );
			int * seg_idx = malloc( I->z_stacked * sizeof(int) );
			int * seg_size = malloc( I->z_stacked * sizeof(int) );
			if( seg_dist == NULL || seg_src == NULL 
					|| seg_idx == NULL || seg_size == NULL )
			{
				printf("Error: out of memory in two_way_transport_sweep\n");
				exit(1);
			}

			// fill matrix with arrays FIXME
			for( int k = 0; k < I->z_stacked; k++)
			{
				seg_size[k] = 2 * I->segments_per_track;
				seg_dist[k] = malloc( seg_size[k] * sizeof(double) );
				seg_src[k] = malloc( seg_size[k] * sizeof(Source *) );
				if( seg_dist[k] == NULL || seg_src[k] == NULL )
				{
					printf("Error: out of memory in two_way_transport_sweep\n");
					exit(1);
				}
				seg_idx[k] = 0;
			}

			// treat positive-z traveling rays first
			bool pos_z_dir = true;
			for( int j = 0; j < I->n_polar_angles; j++)
			{
				// the second half of the polar angles travels in negative z
				if( j == I->n_polar_angles / 2 )
					pos_z_dir = false;
				float p_angle = params->polar_angles[j];
				float mu = cos(p_angle);

				// start with all z stacked rays
				int begin_stacked = 0;
				int end_stacked = I->z_stacked;

				// reset segment indexes
				for( int k = 0; k < I->z_stacked; k++)
					seg_idx[k] = 0;

				for( int n = 0; n < params->tracks_2D[i].n_segments; n++)
				{
					// calculate distance traveled in cell if segment completed
					float s_full = params->tracks_2D[i].segments[n].length 
						/ sin(p_angle);

					// loop over remaining z-stacked rays
					int tracks_completed = 0;
					for( int k = begin_stacked; k < end_stacked; k++)
					{
						// select current track
						Track * track = &params->tracks[i][j][k];

						/* determine current axial interval; the quotient is
						 * truncated, not the height before dividing */
						int interval = (int) ( track->z_height 
								/ fine_delta_z );

						// calculate distance to domain boundary
						float bound_dist;
						if( pos_z_dir)
							bound_dist = (node_delta_z - track->z_height) / mu;
						else
							bound_dist = -track->z_height / mu;

						// determine track length
						float s;
						if(	s_full < bound_dist )
							s = s_full;
						else
						{
							// note completion of track
							s = bound_dist;
							tracks_completed++;
						}

						// set flag for completion of segment
						bool seg_complete = false;

						while( !seg_complete )
						{
							// initialize tracking variables
							long QSR_id = interval + num_intervals * n;
							float ds;
							float z;

							// calculate z height of next fine axial interval
							float fai_z_height;
							if( pos_z_dir )
								fai_z_height = (interval + 1) * fine_delta_z ;
							else
								fai_z_height = interval * fine_delta_z;

							// calculate z distance to next fine axial interval
							float z_dist_to_fai = 
								fai_z_height - track->z_height;

							/* calculate total distance (s) to fine axial 
							 * interval */
							float s_dist_to_fai = z_dist_to_fai / mu;

							// determine if a fine axial interval is crossed
							if( s_dist_to_fai < s )
							{
								if( pos_z_dir )
									interval++;
								else
									interval--;
								ds = s_dist_to_fai;
								z = track->z_height + z_dist_to_fai;
							}
							else
							{
								ds = s;
								z = track->z_height + s * mu;
							}	

							/* shorten remaining segment length and check if
							 * completed (accounting for potential roundoff) */
							s -= ds;
							if( s <= 0 || interval < 0 
									|| interval >= num_intervals)
								seg_complete = true;

							// pick a random FSR (cache miss expected)
							#ifdef OPENMP
							QSR_id = rand_r(&seed) % 
								I->n_source_regions_per_node;
							#else
							QSR_id = rand() % I->n_source_regions_per_node;
							#endif

							/* update sources and fluxes from attenuation 
							 * over FSR */
							if( I->axial_exp == 2 )
							{
								attenuate_fluxes( track, true,
										&params->sources[QSR_id], 
										I, params, ds, mu, 
										params->tracks_2D[i].az_weight, &A );
								segments_processed++;
							}

							else if( I->axial_exp == 0 )
							{
								attenuate_FSR_fluxes( track, true,
										&params->sources[QSR_id],
										I, params, ds, mu,
										params->tracks_2D[i].az_weight, &A );
								// counted as in transport_sweep
								segments_processed++;
							}
							else
							{
								printf("Error: invalid axial expansion order");
								printf("\n Please input 0 or 2\n");
								exit(1);
							}

							// update track height
							track->z_height = z;
							
							// save segment length and source FIXME
							seg_dist[k][seg_idx[k]] = ds;
							seg_src[k][seg_idx[k]] = &params->sources[QSR_id];
							seg_idx[k]++;

							// check if array needs to grow FIXME
							if( seg_idx[k] >= seg_size[k] )
							{
								/* grow through temporaries so the original
								 * pointers are never overwritten by NULL */
								int new_size = 2 * seg_size[k];
								double * new_dist = realloc( seg_dist[k],
										new_size * sizeof(double) );
								if( new_dist == NULL )
								{
									printf("Error: out of memory in "
											"two_way_transport_sweep\n");
									exit(1);
								}
								seg_dist[k] = new_dist;

								Source ** new_src = realloc( seg_src[k],
										new_size * sizeof(Source *) );
								if( new_src == NULL )
								{
									printf("Error: out of memory in "
											"two_way_transport_sweep\n");
									exit(1);
								}
								seg_src[k] = new_src;
								seg_size[k] = new_size;
							}
						}
					}
					// rays that reached the domain boundary are dropped
					if(pos_z_dir)
						end_stacked -= tracks_completed;
					else
						begin_stacked += tracks_completed;
				}

				// loop over all z stacked rays again (backward sweep)
				for( int k = 0; k < I->z_stacked; k++ )
				{
					// replay the recorded pieces in reverse order
					for( int n = seg_idx[k]-1; n >= 0; n--)
					{
						// load distance
						float ds = seg_dist[k][n];

						// select current track
						Track * track = &params->tracks[i][j][k];

						// update sources and fluxes from attenuation over FSR
						if( I->axial_exp == 2 )
						{
							attenuate_fluxes( track, false,
									seg_src[k][n], 
									I, params, ds, -mu, 
									params->tracks_2D[i].az_weight, &A );
							segments_processed++;
						}

						else if( I->axial_exp == 0 )
						{
							attenuate_FSR_fluxes( track, false,
									seg_src[k][n],
									I, params, ds, -mu,
									params->tracks_2D[i].az_weight, &A );
							// counted as in transport_sweep
							segments_processed++;
						}

						// update z height
						track->z_height -= ds * mu;
					}
				}


				/* Update all tracks with correct starting z location again
				 * NOTE: this is only here to account for roundoff error */
				for( int k = 0; k < I->z_stacked; k++)
				{
					Track * track = &params->tracks[i][j][k];
					if( pos_z_dir)
						track->z_height = I->axial_z_sep * k;
					else
						track->z_height = I->axial_z_sep * (k+1);
				}
			}

			// free memory
			for( int k = 0; k < I->z_stacked; k++)
			{
				free(seg_dist[k]);
				free(seg_src[k]);
			}
			free(seg_dist);
			free(seg_src);
			free(seg_idx);
			free(seg_size);

		}

		// release this thread's scratch space (A.q0 is the base address)
		free(A.q0);

		#ifdef OPENMP
		if(thread == 0 && I->mype==0) printf("\n");
		#endif

		#ifdef PAPI
		if( thread == 0 )
		{
			printf("\n");
			border_print();
			center_print("PAPI COUNTER RESULTS", 79);
			border_print();
			printf("Count          \tSmybol      \tDescription\n");
		}
		{
			#pragma omp barrier
		}
		counter_stop(&eventset, num_papi_events, I);
		#endif
	}
	//printf("Number of segments processed: %ld\n", segments_processed);
	I->segments_processed = segments_processed;

	return;
}

/* Axial interval index for a track traveling in the positive direction:
 * the quotient z / dz truncated toward zero. */
int get_pos_interval( float z, float dz)
{
	return (int) (z/dz);
}

/* Axial interval index for a track traveling in the negative direction.
 * The quotient z / dz is rounded up rather than down: it is subtracted from
 * INT_MAX in double precision, truncated to an int, and the result is
 * subtracted from INT_MAX again. */
int get_neg_interval( float z, float dz)
{
	// single precision quotient, exactly as a plain z / dz would give
	const float ratio = z / dz;

	// truncating the mirrored value rounds the quotient itself upwards
	const double mirrored = (double) INT_MAX - (double) ratio;
	const int truncated = (int) mirrored;

	return INT_MAX - truncated;
}

/* Returns the index of the next fine axial interval boundary for a track at
 * height z with interval spacing dz. The current interval is z / dz truncated
 * toward zero; the result is that index plus one when pos_dir is true and the
 * index itself otherwise. */
int calc_next_fai( float z, float dz, bool pos_dir)
{
	int interval = (int) (z/dz);
	if(pos_dir)
		return interval + 1;
	else
		return interval;
}

/* Reference implementation of the flux attenuation along a track across one
 * fine axial region, including the tally of the scalar flux contribution in
 * that region. The source is fitted on the fly from neighboring fine source
 * values: quadratically in interior regions and linearly in the first and
 * last fine axial region.
 *
 * This legacy routine is kept because it states the underlying physical
 * problem group by group in a single loop; attenuate_fluxes is the tuned
 * variant that is used instead.
 *
 * The selected angular flux vector of the track (f_psi when forward is true,
 * b_psi otherwise) is updated in place, and the fine_flux row of QSR for the
 * current region is incremented with an atomic update per group. */
void alt_attenuate_fluxes( Track * track, bool forward, Source * QSR, Input * I,
		Params * params, float ds, float mu, float az_weight ) 
{
	// width of one fine axial interval
	const float dz = I->height / (I->fai * I->decomp_assemblies_ax * I->cai);

	// axial offset of the track from the center of its interval
	const float zin = track->z_height 
		- dz * ( (int)( track->z_height / dz ) + 0.5 );

	// index of the fine axial region inside the source region
	const int fine_id = (int) ( track->z_height / dz ) % I->fai;

	// quadrature weight (azimuthal * polar)
	// NOTE: real app would also have volume weight component
	const float weight = track->p_weight * az_weight;
	const float mu2 = mu * mu;

	// scalar flux row that receives the tally
	float * const region_flux = QSR -> fine_flux[fine_id];

	// angular flux vector for the requested sweep direction
	float * const psi = forward ? track->f_psi : track->b_psi;

	// position of the region inside the source region (first one wins)
	const bool at_first = ( fine_id == 0 );
	const bool at_last = ( fine_id == I->fai - 1 );

	// one pass per energy group
	for( int g = 0; g < I->n_egroups; g++)
	{
		// total cross section of this group
		const float sigT = QSR->sigT[g];

		// source expansion coefficients
		float q0, q1, q2;

		if( at_first )
		{
			// linear fit through this region and the one after it
			const float here = QSR->fine_source[fine_id][g];
			const float next = QSR->fine_source[fine_id+1][g];
			const float slope = (next - here) / dz;

			q0 = here + slope*zin;
			q1 = slope;
			q2 = 0;
		}
		else if( at_last )
		{
			// linear fit through this region and the one before it
			const float prev = QSR->fine_source[fine_id-1][g];
			const float here = QSR->fine_source[fine_id][g];
			const float slope = (here - prev) / dz;

			q0 = here + slope*zin;
			q1 = slope;
			q2 = 0;
		}		
		else
		{
			// quadratic fit through the region and both neighbors
			const float prev = QSR->fine_source[fine_id-1][g];
			const float here = QSR->fine_source[fine_id][g];
			const float next = QSR->fine_source[fine_id+1][g];

			const float lin = (prev - next) / (2*dz);
			const float quad = (prev - 2*here + next) / (2*dz*dz);

			q0 = here + lin*zin + quad*zin*zin;
			q1 = lin + 2*quad*zin;
			q2 = quad;
		}

		// shared sub-expressions
		const float tau = sigT * ds;
		const float sigT2 = sigT * sigT;

		// compute exponential ( 1 - exp(-x) ) using table lookup
		const float expVal = interpolateTable( params->expTable, tau );  

		// integral of the angular flux over the piece of track
		const float flux_integral = (q0 * tau + (sigT * psi[g] - q0) * expVal)
			/ sigT2
			+ q1 * mu * (tau * (tau - 2) + 2 * expVal)
			/ (sigT * sigT2)
			+ q2 * mu2 * (tau * (tau * (tau - 3) + 6) - 6 * expVal)
			/ (3 * sigT2 * sigT2);

		// tally into the scalar flux
		#pragma omp atomic
		region_flux[g] += weight * flux_integral;

		// outgoing angular flux replaces the incoming one
		psi[g] = psi[g] * (1.0 - expVal) + q0 * expVal / sigT
			+ q1 * mu * (tau - expVal) / sigT2 + q2 * mu2 *
			(tau * (tau - 2) + 2 * expVal) / (sigT2 * sigT);
	}
}

/* Determines the change in angular flux along a particular track across a fine
 * axial region and tallies the contribution to the scalar flux in the fine 
 * axial region. This function assumes a constant source.
 *
 * Parameters:
 *   track     - track being attenuated; z_height and p_weight are read, and
 *               the selected angular flux vector (f_psi or b_psi) is updated
 *   forward   - true selects track->f_psi, false selects track->b_psi
 *   FSR       - source region providing sigT and fine_source; the fine_flux
 *               row of the current fine axial region is incremented (guarded
 *               by FSR->locks[fine_id] when OPENMP is defined)
 *   I         - input struct (height, fai, cai, decomp_assemblies_ax and
 *               n_egroups are read)
 *   params_in - parameter struct (expTable is read)
 *   ds        - distance traveled along the track inside the region
 *   mu        - factor folded into the tally weight; callers pass the polar
 *               angle cosine
 *   az_weight - azimuthal weight, multiplied by track->p_weight and mu
 *   A         - caller-provided scratch arrays; tally, expVal, sigT and tau
 *               are overwritten for indices [0, n_egroups) */ 
void attenuate_FSR_fluxes( Track * track, bool forward, Source * FSR, Input * I,
		Params * params_in, float ds, float mu, float az_weight, 
		AttenuateVars *A)
{
	// unpack attenuate vars struct
	float *  restrict tally = A->tally;
	float *  restrict expVal = A->expVal;
	float *  restrict sigT = A->sigT;
	float *  restrict tau = A->tau;

	Params params = * params_in;

	// compute fine axial interval spacing
	float dz = I->height / (I->fai * I->decomp_assemblies_ax * I->cai);

	// compute fine axial region ID
	int fine_id = (int) ( track->z_height / dz ) % I->fai;

	// compute weight (azimuthal * polar)
	// NOTE: real app would also have volume weight component
	float weight = track->p_weight * az_weight * mu;

	// load fine source region flux vector
	float * FSR_flux = FSR -> fine_flux[fine_id];

	// cycle over energy groups
	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I->n_egroups; g++)
	{
		// load total cross section
		sigT[g] = FSR->sigT[g];
		tau[g] = sigT[g] * ds;
	}

	// compute exponential ( 1 - exp(-x) ) using table lookup
	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for(int g = 0; g < I->n_egroups; g++)
	{
		expVal[g] = interpolateTable( params.expTable, tau[g] );
	}

	// load correct angular flux vector
	float * psi;
	if(forward)
		psi = track->f_psi;
	else
		psi = track->b_psi;

	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I->n_egroups; g++)
	{
		// compute angular flux attenuation
		float q = FSR->fine_source[fine_id][g] / sigT[g];
		float delta_psi = (psi[g] - q) * expVal[g];

		// add contribution to new source flux
		tally[g] = weight * delta_psi;

		// update angular flux
		psi[g] -= delta_psi;
	}


	// the scalar flux row is shared between threads, so guard the update
	#ifdef OPENMP
	omp_set_lock(&FSR->locks[fine_id]);
	#endif

	#ifdef INTEL
	#pragma simd
	#elif defined IBM
	#pragma simd_level(10)
	#endif
	for( int g = 0; g < I->n_egroups; g++)
	{
		FSR_flux[g] += tally[g];
	}

	#ifdef OPENMP
	omp_unset_lock(&FSR->locks[fine_id]);
	#endif


}

/* Renormalizes scalar and angular flux for next transport sweep iteration.
 * Calculation requires multiple pair-wise sums and a reduction across all 
 * nodes.
 *
 * The node fission rate is the sum over all source regions, fine axial
 * regions and energy groups of fine_flux * vol * XS[g][0]. With MPI defined
 * it is summed over grid.cart_comm_3d. Every fine_flux entry is then scaled
 * by norm_factor * 4 * M_PI * I.fai / vol and every f_psi / b_psi entry by
 * norm_factor, where norm_factor = 1 / total fission rate.
 *
 * The structs are passed by value, but the arrays they point to
 * (params.sources, params.tracks) are modified in place. */
void renormalize_flux( Params params, Input I, CommGrid grid )
{
	if( I.mype == 0 ) printf("Renormalizing Flux...\n");
	float node_fission_rate = 0;
	#ifdef OPENMP
	#pragma omp parallel default(none) shared(params, I, grid) \
	reduction(+ : node_fission_rate)
	{
	#endif
		/* tally total fission rate (pair-wise sum). The array is zeroed
		 * because inside a parallel region each thread only fills the
		 * entries of the iterations it executes, yet sums the whole array;
		 * the zero entries make the per-thread sums add up to the node
		 * total in the reduction. */
		float * fission_rates = calloc( I.n_source_regions_per_node, 
				sizeof(float) );

		float * fine_fission_rates = malloc( I.fai * sizeof(float) );
		float * g_fission_rates = malloc( I.n_egroups * sizeof(float) );

		// accumulate total fission rate on node domain
		#pragma omp for schedule(dynamic)
		for( int i = 0; i < I.n_source_regions_per_node; i++)
		{
			Source src = params.sources[i];
			for( int j = 0; j < I.fai; j++)
			{
				for( int g = 0; g < I.n_egroups; g++)
					g_fission_rates[g] = src.fine_flux[j][g] * src.vol 
						* src.XS[g][0];
				fine_fission_rates[j] = pairwise_sum( g_fission_rates, 
						I.n_egroups );
			}
			fission_rates[i] = pairwise_sum( fine_fission_rates, I.fai );
		}
		node_fission_rate = pairwise_sum(fission_rates, 
				I.n_source_regions_per_node);
		
		// free allocated memory
		free(fission_rates);
		free(fine_fission_rates);
		free(g_fission_rates);
	
	#ifdef OPENMP
	}
	#endif

	#ifdef MPI	
	// accumulate total fission rate by MPI Allreduce
	float total_fission_rate = 0;
	MPI_Barrier(grid.cart_comm_3d);
	MPI_Allreduce( &node_fission_rate, // Send Buffer
			&total_fission_rate,    // Receive Buffer
			1,                    	// Element Count
			MPI_FLOAT,           	// Element Type
			MPI_SUM,              	// Reduction Operation Type
			grid.cart_comm_3d );  	// MPI Communicator
	MPI_Barrier(grid.cart_comm_3d);
	#else
	float total_fission_rate = node_fission_rate;
	#endif


	// normalize fluxes by fission reaction rate
	float norm_factor = 1.0 / total_fission_rate;

	/* norm_factor is firstprivate so that every thread starts with the value
	 * computed above; a plain private copy would be uninitialized. */
	#pragma omp parallel for default(none) \
	shared(I, params) firstprivate(norm_factor) schedule(dynamic)
	for( int i = 0; i < I.n_source_regions_per_node; i++)
	{
		Source * src = &params.sources[i];
		float adjust = norm_factor * 4 * M_PI * I.fai / src->vol;
		for( int k = 0; k < I.fai; k++)
			for( int g = 0; g < I.n_egroups; g++)
				src->fine_flux[k][g] *= adjust;
	}

	// normalize boundary fluxes by same factor
	#pragma omp parallel for default(none) \
	shared(I, params) firstprivate(norm_factor) schedule(dynamic)
	for( long i = 0; i < I.ntracks_2D; i++)
		for( int j = 0; j < I.n_polar_angles; j++)
			for( int k = 0; k < I.z_stacked; k++)
				for( int g = 0; g < I.n_egroups; g++)
				{
					params.tracks[i][j][k].f_psi[g] *= norm_factor;
					params.tracks[i][j][k].b_psi[g] *= norm_factor;
				}

	if( I.mype == 0 ) printf("Renormalizing Flux Complete.\n");
	return;
}

/* Updates sources for next iteration by computing scattering and fission
 * components. Calculation includes multiple pair-wise sums and reductions
 * across all nodes. */
float update_sources( Params params, Input I, float keff )
{
	// source residual
	float residual;

	// calculate inverse multiplication facotr for efficiency
	float inverse_k = 1.0 / keff;

	// allocate residual arrays
	float * group_res = (float *) malloc(I.n_egroups * sizeof(float));
	float * fine_res = (float *) malloc(I.fai * sizeof(float));
	float * residuals = (float *) malloc(I.n_source_regions_per_node 
			* sizeof(float));

	// allocate arrays for summation
	float * fission_rates = malloc(I.n_egroups * sizeof(float));
	float * scatter_rates = malloc(I.n_egroups * sizeof(float));

	// cycle through all coarse axial intervals to update source
	for( long i = 0; i < I.n_source_regions_per_node; i++)
	{
		Source src = params.sources[i];

		// cycle thorugh all fine axial regions to calculate new source
		for( int j = 0; j < I.fai; j++)
		{
			// calculate total fission source and scattering source
			float fission_source;
			float scatter_source;

			// compute total fission source
			for( int g = 0; g < I.n_egroups; g++ )
				fission_rates[g] = src.fine_flux[j][g] * src.XS[g][0];
			fission_source = pairwise_sum( fission_rates, (long) I.n_egroups);

			// normalize fission source by multiplication factor
			fission_source *= inverse_k;

			// compute scattering and new total source for each group
			for( int g = 0; g < I.n_egroups; g++ )
			{
				for( int g2 = 0; g2 < I.n_egroups; g2++ )
				{
					// compute scatter source originating from g2 -> g
					scatter_rates[g2] = src.scattering_matrix[g][g2] * 
						src.fine_flux[j][g2];
				}
				scatter_source = pairwise_sum(scatter_rates, 
						(long) I.n_egroups);

				// compuate new total source
				float chi = src.XS[g][2];

				// calculate new fine source
				float newSrc = (fission_source * chi + scatter_source) 
					/ (4.0 * M_PI);

				// calculate residual
				float oldSrc = src.fine_source[j][g];
				group_res[g] = (newSrc - oldSrc) * (newSrc - oldSrc)
					/ (oldSrc * oldSrc);

				/* calculate new source in fine axial interval assuming 
				 * isotropic source components */
				src.fine_source[j][g] = newSrc;
			}
			fine_res[j] = pairwise_sum(group_res, (long) I.n_egroups);
		}
		residuals[i] = pairwise_sum(fine_res, (long) I.fai);
	}

	// calculate source residual
	residual = pairwise_sum(residuals, I.n_source_regions_per_node);

	// free memory
	free(fission_rates);
	free(scatter_rates);
	free(group_res);
	free(fine_res);
	free(residuals);


	// NOTE: See code around line 600 of CPUSolver.cpp in ClosedMOC/ OpenMOC

	return residual;
}

/* Helper for compute_keff: sums flux * XS[g][xs_col] over every energy group,
 * fine axial interval and source region on this node using pairwise 
 * summation at each level. The four scratch arrays are supplied by the caller
 * and must hold n_egroups, n_egroups, fai and n_source_regions_per_node 
 * floats respectively. */
static float keff_node_reaction_rate( Params params, Input I, int xs_col,
		float * sigma, float * group_rates, float * fine_rates, 
		float * QSR_rates )
{
	for( long i = 0; i < I.n_source_regions_per_node; i++)
	{
		// load the requested XS column for this source region
		Source src = params.sources[i];
		for( int g = 0; g < I.n_egroups; g++)
			sigma[g] = src.XS[g][xs_col];

		for( int j = 0; j < I.fai; j++ )
		{
			// calculate reaction rates in each group
			float * fine_flux = src.fine_flux[j];
			for( int g = 0; g < I.n_egroups; g++)
				group_rates[g] = sigma[g] * fine_flux[g];

			// sum over all energy groups
			fine_rates[j] = pairwise_sum( group_rates, (long) I.n_egroups );
		}
		// sum over all fine axial intervals
		QSR_rates[i] = pairwise_sum( fine_rates, (long) I.fai );
	}
	// sum over all source regions in a node
	return pairwise_sum( QSR_rates, I.n_source_regions_per_node );
}

/* Computes global k-effective using multiple pair-wise summations and finally
 * a reduction across all nodes.
 *
 * params - solver state; reads params.sources and the leakage tally that
 *          params.leakage points to
 * I      - problem dimensions (n_source_regions_per_node, fai, n_egroups)
 * grid   - communicator grid; grid.cart_comm_3d is used for the reduction
 *          when compiled with MPI
 *
 * Returns fission / (absorption + leakage). With MPI the three totals are
 * summed over all ranks and every rank receives the result, so the returned 
 * keff is valid on every rank and not only on rank 0.
 *
 * Aborts the program if the scratch arrays cannot be allocated. */
float compute_keff(Params params, Input I, CommGrid grid)
{
	// allocate temporary memory
	float * sigma = malloc( I.n_egroups * sizeof *sigma );
	float * group_rates = malloc( I.n_egroups * sizeof *group_rates );
	float * fine_rates = malloc( I.fai * sizeof *fine_rates );
	float * QSR_rates = malloc( I.n_source_regions_per_node 
			* sizeof *QSR_rates );

	// every array is indexed below, so a failed allocation cannot be survived
	if( sigma == NULL || group_rates == NULL || fine_rates == NULL
			|| QSR_rates == NULL )
	{
		fprintf( stderr, "compute_keff: unable to allocate scratch arrays\n" );
		#ifdef MPI
		MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE );
		#endif
		exit( EXIT_FAILURE );
	}

	///////////////////////////////////////////////////////////////////////////

	// total absorption rate on this node (absorption XS is column 1)
	float node_abs = keff_node_reaction_rate( params, I, 1, sigma, 
			group_rates, fine_rates, QSR_rates );

	// total fission rate on this node (nuSigmaF XS is column 0)
	float node_fission = keff_node_reaction_rate( params, I, 0, sigma, 
			group_rates, fine_rates, QSR_rates );

	// free memory
	free(sigma);
	free(group_rates);
	free(fine_rates);
	free(QSR_rates);

	///////////////////////////////////////////////////////////////////////////

	#ifdef MPI 

	/* MPI Reduction: absorption, fission and leakage are summed in a single
	 * call. Allreduce is required (rather than a Reduce to rank 0) because 
	 * every rank computes and returns keff from these totals. */
	float node_totals[3] = { node_abs, node_fission, *params.leakage };
	float global_totals[3] = { 0.0f, 0.0f, 0.0f };

	MPI_Allreduce( node_totals,     // Send Buffer
			global_totals,          // Receive Buffer
			3,                      // Element Count
			MPI_FLOAT,              // Element Type
			MPI_SUM,                // Reduction Operation Type
			grid.cart_comm_3d );    // MPI Communicator

	// keep all ranks in step before returning
	MPI_Barrier(grid.cart_comm_3d);

	// calculate keff
	float keff = global_totals[1] / (global_totals[0] + global_totals[2]);
	#else
	float keff = node_fission / (node_abs + *params.leakage);
	#endif

	return keff;
}

/* Interpolates a previously built exponential table to approximate 
 * ( 1 - exp(-x) ) at the desired x value.
 *
 * table - lookup table; table.values stores a (slope, intercept) pair per
 *         interval, table.dx is the interval width and table.maxVal the
 *         largest x covered
 * x     - point at which to evaluate the approximation
 *
 * Returns 1.0f when x lies above table.maxVal, otherwise the value of the
 * linear segment selected for x. The computed interval index is not range
 * checked, so x is expected to be non-negative. */
float interpolateTable( Table table, float x)
{
	// outside the tabulated domain the function is taken as saturated
	if( x > table.maxVal )
		return 1.0f;

	/* pick the linear segment for x
	 * NOTE(review): the offset added here is 0.5f * dx, not 0.5f, so this is
	 * not a plain round-to-nearest of x / dx -- confirm against the routine
	 * that builds the table */
	int segment = (int) ( x / table.dx + 0.5f * table.dx );

	// slope and intercept of a segment are stored back to back
	int base = 2 * segment;
	float m = table.values[ base ];
	float b = table.values[ base + 1 ];

	float result = m * x + b;
	return result;
}