diff --git a/data/mie_lut_broadband.nc b/data/mie_lut_broadband.nc
index 0970b47c..f0ce2ae4 100644
Binary files a/data/mie_lut_broadband.nc and b/data/mie_lut_broadband.nc differ
diff --git a/data/mie_lut_visualisation.nc b/data/mie_lut_visualisation.nc
index 5fc50ae1..d2479895 100644
Binary files a/data/mie_lut_visualisation.nc and b/data/mie_lut_visualisation.nc differ
diff --git a/include_rt/Gas_optics_rrtmgp_rt.h b/include_rt/Gas_optics_rrtmgp_rt.h
index ea296ad2..f24cd774 100644
--- a/include_rt/Gas_optics_rrtmgp_rt.h
+++ b/include_rt/Gas_optics_rrtmgp_rt.h
@@ -145,12 +145,12 @@ class Gas_optics_rrtmgp_rt : public Gas_optics_rt
                 const Array_gpu<Float,2>& play,
                 const Array_gpu<Float,2>& plev,
                 const Array_gpu<Float,2>& tlay,
+                const Array_gpu<Float,2>& tlev,
                 const Array_gpu<Float,1>& tsfc,
                 const Gas_concs_gpu& gas_desc,
                 std::unique_ptr<Optical_props_arry_rt>& optical_props,
                 Source_func_lw_rt& sources,
-                const Array_gpu<Float,2>& col_dry,
-                const Array_gpu<Float,2>& tlev);
+                const Array_gpu<Float,2>& col_dry);
 
         // shortwave variant
         void gas_optics(
@@ -219,7 +219,7 @@ class Gas_optics_rrtmgp_rt : public Gas_optics_rt
         Array<Float,4> krayl;
 
         int idx_h2o;
-        
+
         Array_gpu<Float,1> solar_source_g;
         Array_gpu<Float,2> totplnk_gpu;
         Array_gpu<Float,4> planck_frac_gpu;
@@ -294,7 +294,7 @@ class Gas_optics_rrtmgp_rt : public Gas_optics_rt
                 const Float md_index, const Float sb_index);
 
         void compute_gas_taus(
-                const int col_s, const int ncol_block, const int ncol, const int nlay, 
+                const int col_s, const int ncol_block, const int ncol, const int nlay,
                 const int ngpt, const int nband, const int igpt,
                 const Array_gpu<Float,2>& play,
                 const Array_gpu<Float,2>& plev,
@@ -313,7 +313,7 @@ class Gas_optics_rrtmgp_rt : public Gas_optics_rt
                 std::unique_ptr<Optical_props_arry_rt>& optical_props);
 
         void source(
-                const int ncol, const int nlay, const int nband, const int ngpt, const int igpt,
+                const int col_s, const int ncol_sub, const int ncol, const int nlay, const int nband, const int ngpt, const int igpt,
                 const Array_gpu<Float,2>& play, const Array_gpu<Float,2>& plev,
                 const Array_gpu<Float,2>& tlay, const Array_gpu<Float,1>& tsfc,
                 const Array_gpu<int,2>& jtemp, const Array_gpu<int,2>& jpress,
diff --git a/include_rt/Gas_optics_rt.h b/include_rt/Gas_optics_rt.h
index 30173839..f3f6c519 100644
--- a/include_rt/Gas_optics_rt.h
+++ b/include_rt/Gas_optics_rt.h
@@ -61,12 +61,12 @@ class Gas_optics_rt : public Optical_props_rt
                 const Array_gpu<Float,2>& play,
                 const Array_gpu<Float,2>& plev,
                 const Array_gpu<Float,2>& tlay,
+                const Array_gpu<Float,2>& tlev,
                 const Array_gpu<Float,1>& tsfc,
                 const Gas_concs_gpu& gas_desc,
                 std::unique_ptr<Optical_props_arry_rt>& optical_props,
                 Source_func_lw_rt& sources,
-                const Array_gpu<Float,2>& col_dry,
-                const Array_gpu<Float,2>& tlev) = 0;
+                const Array_gpu<Float,2>& col_dry) = 0;
 
         // Shortwave variant.
         virtual void gas_optics(
@@ -80,7 +80,7 @@ class Gas_optics_rt : public Optical_props_rt
                 const Array_gpu<Float,2>& col_dry) = 0;
 
        virtual Float get_tsi() const = 0;
-       
+
        virtual Float band_source(const int gpt_start, const int gpt_end) const = 0;
 };
 #endif
diff --git a/include_rt/Source_functions_rt.h b/include_rt/Source_functions_rt.h
index 7d55006d..36528f8c 100644
--- a/include_rt/Source_functions_rt.h
+++ b/include_rt/Source_functions_rt.h
@@ -24,7 +24,7 @@
 
 #ifndef SOURCE_FUNCTIONS_RT_H
 #define SOURCE_FUNCTIONS_RT_H
-#include "Optical_props_rt.h" 
+#include "Optical_props_rt.h"
 
 template<typename, int> class Array_gpu;
 
@@ -48,21 +48,18 @@ class Source_func_lw_rt : public Optical_props_rt
         Array_gpu<Float,1>& get_sfc_source()     { return sfc_source;     }
         Array_gpu<Float,1>& get_sfc_source_jac() { return sfc_source_jac; }
         Array_gpu<Float,2>& get_lay_source()     { return lay_source;     }
-        Array_gpu<Float,2>& get_lev_source_inc() { return lev_source_inc; }
-        Array_gpu<Float,2>& get_lev_source_dec() { return lev_source_dec; }
+        Array_gpu<Float,2>& get_lev_source()     { return lev_source; }
 
         const Array_gpu<Float,1>& get_sfc_source()     const { return sfc_source;     }
         const Array_gpu<Float,1>& get_sfc_source_jac() const { return sfc_source_jac; }
         const Array_gpu<Float,2>& get_lay_source()     const { return lay_source;     }
-        const Array_gpu<Float,2>& get_lev_source_inc() const { return lev_source_inc; }
-        const Array_gpu<Float,2>& get_lev_source_dec() const { return lev_source_dec; }
+        const Array_gpu<Float,2>& get_lev_source()     const { return lev_source; }
 
     private:
         Array_gpu<Float,1> sfc_source;
         Array_gpu<Float,1> sfc_source_jac;
         Array_gpu<Float,2> lay_source;
-        Array_gpu<Float,2> lev_source_inc;
-        Array_gpu<Float,2> lev_source_dec;
+        Array_gpu<Float,2> lev_source;
 };
 
 #endif
diff --git a/include_rt/raytracer_functions.h b/include_rt/raytracer_functions.h
index 8e614d50..4ca88988 100644
--- a/include_rt/raytracer_functions.h
+++ b/include_rt/raytracer_functions.h
@@ -87,104 +87,62 @@ namespace Raytracer_functions
     }
 
     __device__
-    inline Float mie_sample_angle(const Float* mie_cdf, const Float* mie_lut, const Float random_number, const Float r_eff, const int n_mie)
+    inline int find_index(const float* mie_cdf, const int size, const float random_number)
     {
-        // interpolation over effective radius. Currently, r_eff should range between 2.5 and 21.5 (similar to RRTMGP) OR
-        // be exactly 100 micrometer for optical effects such as rainbows
-        const int r_idx = (r_eff == Float(100.)) ? 20 : min(max(int(r_eff-2.5), 0), 18);
-        const Float r_rest = fmod(r_eff-Float(2.5),Float(1.));
+        int left = 0;
+        int right = size - 1;
 
-        int i = 0;
-        while (random_number < mie_cdf[i])
-        {
-            ++i;
-        }
+        while (left < right) {
+            int mid = left + (right - left) / 2;
 
-        // sampled scattering angle
-        Float ang;
-        if (r_idx < 20)
-        {
-            if (i==0)
-            {
-                const Float ang_lwr = mie_lut[r_idx*n_mie]*(1-r_rest);
-                const Float ang_upr = mie_lut[(r_idx+1)*n_mie]*r_rest;
-                ang = ang_lwr + ang_upr;
-            }
-            else
-            {
-                const int midx_lwr = r_idx*n_mie;
-                const int midx_upr = (r_idx+1)*n_mie;
-                const Float dr = abs(mie_cdf[i] - mie_cdf[i-1]);
-
-                const Float ang_lwr = (abs(random_number - mie_cdf[i])*mie_lut[(i-1)+midx_lwr] + abs(mie_cdf[i-1]-random_number)*mie_lut[i+midx_lwr]) / dr;
-                const Float ang_upr = (abs(random_number - mie_cdf[i])*mie_lut[(i-1)+midx_upr] + abs(mie_cdf[i-1]-random_number)*mie_lut[i+midx_upr]) / dr;
-                ang = ang_lwr * (1-r_rest) + ang_upr * r_rest;
+            if (random_number >= mie_cdf[mid]) {
+                right = mid;
+            } else {
+                left = mid + 1;
             }
         }
-        else
-        {
-            if (i==0)
-            {
-                ang = mie_lut[r_idx*n_mie];
-            }
-            else
-            {
-                const int midx = r_idx*n_mie;
-                const Float dr = abs(mie_cdf[i] - mie_cdf[i-1]);
 
-                ang = (abs(random_number - mie_cdf[i])*mie_lut[(i-1)+midx] + abs(mie_cdf[i-1]-random_number)*mie_lut[i+midx]) / dr;
-            }
-        }
+        return left - 1;
+    }
+
+    __device__
+    inline Float mie_sample_angle(const Float* mie_cdf, const Float* mie_lut, const Float random_number, const Float r_eff, const int n_mie)
+    {
+        // interpolation over effective radius. Currently, r_eff should range between 2.5 and 21.5 (similar to RRTMGP)
+        const int r_idx = min(max(int(r_eff-2.5), 0), 18);
+        const Float r_rest = fmod(r_eff-Float(2.5),Float(1.));
+
+        const int i = min(max(0, find_index(mie_cdf, n_mie, random_number)), n_mie - 2);
+
+        const int midx_lwr = r_idx*n_mie;
+        const int midx_upr = (r_idx+1)*n_mie;
+        const Float dr = abs(mie_cdf[i+1] - mie_cdf[i]);
+
+        const Float ang_lwr = (abs(random_number - mie_cdf[i+1])*mie_lut[(i)+midx_lwr] + abs(mie_cdf[i]-random_number)*mie_lut[i+midx_lwr+1]) / dr;
+        const Float ang_upr = (abs(random_number - mie_cdf[i+1])*mie_lut[(i)+midx_upr] + abs(mie_cdf[i]-random_number)*mie_lut[i+midx_upr+1]) / dr;
+        const Float ang = ang_lwr * (1-r_rest) + ang_upr * r_rest;
         return ang;
     }
 
     __device__
     inline Float mie_interpolate_phase_table(const Float* mie_phase, const Float* mie_lut, const Float scat_ang, const Float r_eff, const int n_mie)
     {
-        // interpolation over effective radius. Currently, r_eff should range between 2.5 and 21.5 (similar to RRTMGP) OR
-        // be exactly 100 micrometer for optical effects such as rainbows
-        const int r_idx = (r_eff == Float(100.)) ? 20 : min(max(int(r_eff-2.5), 0), 18);
+        // interpolation over effective radius. Currently, r_eff should range between 2.5 and 21.5 (similar to RRTMGP)
+        const int r_idx = min(max(int(r_eff-2.5), 0), 18);
         const Float r_rest = fmod(r_eff-Float(2.5),Float(1.));
 
         // interpolation between 1800 equally spaced scattering angles between 0 and PI (both inclusive).
-        const Float d_pi = Float(1.74629942e-03);
-        const int i = min(max(0, int(1800-(scat_ang/d_pi+1))), 1798);
+        constexpr Float d_pi = Float(1.74629942e-03);
+        const int i = min(max(0, int(scat_ang/d_pi)), 1798);
 
-        // probability (of scattering at angle scat_ang)
-        Float prob;
-        if (r_idx < 20)
-        {
-            if (i==0)
-            {
-                const Float prob_lwr = mie_lut[r_idx*n_mie]*(1-r_rest);
-                const Float prob_upr = mie_lut[(r_idx+1)*n_mie]*r_rest;
-                prob = prob_lwr + prob_upr;
-            }
-            else
-            {
-                const int midx_lwr = r_idx*n_mie;
-                const int midx_upr = (r_idx+1)*n_mie;
-                const Float dr = abs(mie_phase[i] - mie_phase[i-1]);
-
-                const Float prob_lwr = (abs(scat_ang - mie_phase[i])*mie_lut[(i-1)+midx_lwr] + abs(mie_phase[i-1]-scat_ang)*mie_lut[i+midx_lwr]) / dr;
-                const Float prob_upr = (abs(scat_ang - mie_phase[i])*mie_lut[(i-1)+midx_upr] + abs(mie_phase[i-1]-scat_ang)*mie_lut[i+midx_upr]) / dr;
-                prob = prob_lwr * (1-r_rest) + prob_upr * r_rest;
-            }
-        }
-        else
-        {
-            if (i==0)
-            {
-                prob = mie_lut[r_idx*n_mie];
-            }
-            else
-            {
-                const int midx = r_idx*n_mie;
-                const Float dr = abs(mie_phase[i] - mie_phase[i-1]);
+        const int midx_lwr = r_idx*n_mie;
+        const int midx_upr = (r_idx+1)*n_mie;
+        const Float dr = abs(mie_phase[i+1] - mie_phase[i]);
+
+        const Float prob_lwr = (abs(scat_ang - mie_phase[i+1])*mie_lut[(i)+midx_lwr] + abs(mie_phase[i]-scat_ang)*mie_lut[i+1+midx_lwr]) / dr;
+        const Float prob_upr = (abs(scat_ang - mie_phase[i+1])*mie_lut[(i)+midx_upr] + abs(mie_phase[i]-scat_ang)*mie_lut[i+1+midx_upr]) / dr;
+        const Float prob = prob_lwr * (1-r_rest) + prob_upr * r_rest;
 
-                prob = (abs(scat_ang - mie_phase[i])*mie_lut[(i-1)+midx] + abs(mie_phase[i-1]-scat_ang)*mie_lut[i+midx]) / dr;
-            }
-        }
         return prob;
     }
 
diff --git a/include_rt_kernels/gas_optics_rrtmgp_kernels_cuda_rt.h b/include_rt_kernels/gas_optics_rrtmgp_kernels_cuda_rt.h
index 73358260..3cf09cb3 100644
--- a/include_rt_kernels/gas_optics_rrtmgp_kernels_cuda_rt.h
+++ b/include_rt_kernels/gas_optics_rrtmgp_kernels_cuda_rt.h
@@ -33,10 +33,10 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
     void reorder123x321(const int ni, const int nj, const int nk,
             const Float* arr_in, Float* arr_out);
 
-    
+
     void reorder12x21(const int ni, const int nj, const Float* arr_in, Float* arr_out);
 
-    
+
     void zero_array(const int ni, const int nj, const int nk, const int nn, Float* arr);
     void zero_array(const int ni, const int nj, const int nk, Float* arr);
     void zero_array(const int ni, const int nj, Float* arr);
@@ -67,7 +67,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
 
     void combine_abs_and_rayleigh(
             const int col_s, const int ncol_sub, const int ncol, const int nlay,
-            const Float* tau_local, const Float* tau_rayleigh,
+            const Float* tau_rayleigh,
             Float* tau, Float* ssa, Float* g);
 
 
@@ -84,7 +84,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
 
 
     void compute_tau_absorption(
-            const int col_s, const int ncol_sub, const int ncol, const int nlay, const int nband, 
+            const int col_s, const int ncol_sub, const int ncol, const int nlay, const int nband,
             const int ngpt, const int igpt,
             const int ngas, const int nflav, const int neta, const int npres, const int ntemp,
             const int nminorlower, const int nminorklower,
@@ -117,8 +117,8 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
             Float* tau);
 
 
-    void Planck_source(
-            const int ncol, const int nlay, const int nbnd, const int ngpt, const int igpt,
+    void compute_planck_source(
+            const int col_s, const int ncol_sub, const int ncol, const int nlay, const int nbnd, const int ngpt, const int igpt,
             const int nflav, const int neta, const int npres, const int ntemp,
             const int nPlanckTemp,
             const Float* tlay,
@@ -137,8 +137,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
             const Float* totplnk,
             Float* sfc_src,
             Float* lay_src,
-            Float* lev_src_inc,
-            Float* lev_src_dec,
+            Float* lev_src,
             Float* sfc_src_jac);
 }
 #endif
diff --git a/include_rt_kernels/raytracer_kernels_bw.h b/include_rt_kernels/raytracer_kernels_bw.h
index af7c0e4f..2de1c224 100644
--- a/include_rt_kernels/raytracer_kernels_bw.h
+++ b/include_rt_kernels/raytracer_kernels_bw.h
@@ -17,8 +17,8 @@ using namespace Raytracer_functions;
 
 
 #ifdef RTE_USE_SP
-constexpr int bw_kernel_block= 512;
-constexpr int bw_kernel_grid = 1024;
+constexpr int bw_kernel_block= 128;
+constexpr int bw_kernel_grid = 2048;
 #else
 constexpr int bw_kernel_block = 256;
 constexpr int bw_kernel_grid = 256;
diff --git a/include_rt_kernels/rte_solver_kernels_cuda_rt.h b/include_rt_kernels/rte_solver_kernels_cuda_rt.h
index 14793b9c..cbe47b70 100644
--- a/include_rt_kernels/rte_solver_kernels_cuda_rt.h
+++ b/include_rt_kernels/rte_solver_kernels_cuda_rt.h
@@ -32,25 +32,25 @@
 namespace Rte_solver_kernels_cuda_rt
 {
     void apply_BC(
-            const int ncol, const int nlay, const int ngpt, const Bool top_at_1,
+            const int ncol, const int nlay, const Bool top_at_1,
             const Float* inc_flux_dir, const Float* mu0, Float* gpt_flux_dir);
 
-    void apply_BC(const int ncol, const int nlay, const int ngpt, const Bool top_at_1, Float* gpt_flux_dn);
+    void apply_BC(const int ncol, const int nlay, const Bool top_at_1, Float* gpt_flux_dn);
 
-    void apply_BC(const int ncol, const int nlay, const int ngpt, const Bool top_at_1, const Float* inc_flux_dif, Float* gpt_flux_dn);
-    
-    void apply_BC(const int ncol, const int nlay, const int ngpt, const Bool top_at_1, const Float inc_flux, Float* gpt_flux_dn);
+    void apply_BC(const int ncol, const int nlay, const Bool top_at_1, const Float* inc_flux_dif, Float* gpt_flux_dn);
+
+    void apply_BC(const int ncol, const int nlay, const Bool top_at_1, const Float inc_flux, Float* gpt_flux_dn);
 
     void sw_solver_2stream(
-            const int ncol, const int nlay, const int ngpt, const Bool top_at_1,
+            const int ncol, const int nlay, const Bool top_at_1,
             const Float* tau, const Float* ssa, const Float* g,
             const Float* mu0, const Float* sfc_alb_dir, const Float* sfc_alb_dif,
             Float* flux_up, Float* flux_dn, Float* flux_dir);
 
     void lw_solver_noscat_gaussquad(
-            const int ncol, const int nlay, const int ngpt, const Bool top_at_1, const int nmus,
+            const int ncol, const int nlay, const Bool top_at_1, const int nmus,
             const Float* ds, const Float* weights, const Float* tau, const Float* lay_source,
-            const Float* lev_source_inc, const Float* lev_source_dec, const Float* sfc_emis,
+            const Float* lev_source, const Float* sfc_emis,
             const Float* sfc_src, Float* flux_up, Float* flux_dn,
             const Float* sfc_src_jac, Float* flux_up_jac);
 }
diff --git a/include_test/Radiation_solver_rt.h b/include_test/Radiation_solver_rt.h
index 055fdb7c..58ed0833 100644
--- a/include_test/Radiation_solver_rt.h
+++ b/include_test/Radiation_solver_rt.h
@@ -45,18 +45,26 @@ class Radiation_solver_longwave
         #ifdef __CUDACC__
         void solve_gpu(
                 const bool switch_fluxes,
+                const bool switch_raytracing,
                 const bool switch_cloud_optics,
+                const bool switch_aerosol_optics,
                 const bool switch_single_gpt,
                 const int single_gpt,
+                const Int ray_count,
+                const Vector<int> grid_cells,
+                const Vector<Float> grid_d,
+                const Vector<int> kn_grid,
                 const Gas_concs_gpu& gas_concs,
+                Aerosol_concs_gpu& aerosol_concs,
                 const Array_gpu<Float,2>& p_lay, const Array_gpu<Float,2>& p_lev,
                 const Array_gpu<Float,2>& t_lay, const Array_gpu<Float,2>& t_lev,
                 Array_gpu<Float,2>& col_dry,
                 const Array_gpu<Float,1>& t_sfc, const Array_gpu<Float,2>& emis_sfc,
                 const Array_gpu<Float,2>& lwp, const Array_gpu<Float,2>& iwp,
                 const Array_gpu<Float,2>& rel, const Array_gpu<Float,2>& dei,
-                Array_gpu<Float,2>& tau, Array_gpu<Float,2>& lay_source,
-                Array_gpu<Float,2>& lev_source_inc, Array_gpu<Float,2>& lev_source_dec, Array_gpu<Float,1>& sfc_source,
+                const Array_gpu<Float,2>& rh,
+                Array_gpu<Float,2>& tot_tau_out, Array_gpu<Float,2>& cld_tau_out, Array_gpu<Float,2>& lay_source,
+                Array_gpu<Float,2>& lev_source, Array_gpu<Float,1>& sfc_source,
                 Array_gpu<Float,2>& lw_flux_up, Array_gpu<Float,2>& lw_flux_dn, Array_gpu<Float,2>& lw_flux_net,
                 Array_gpu<Float,2>& lw_gpt_flux_up, Array_gpu<Float,2>& lw_gpt_flux_dn, Array_gpu<Float,2>& lw_gpt_flux_net);
 
@@ -74,6 +82,7 @@ class Radiation_solver_longwave
         #ifdef __CUDACC__
         std::unique_ptr<Gas_optics_rrtmgp_rt> kdist_gpu;
         std::unique_ptr<Cloud_optics_rt> cloud_optics_gpu;
+        std::unique_ptr<Aerosol_optics_rt> aerosol_optics_gpu;
         Rte_lw_rt rte_lw;
 
         std::unique_ptr<Optical_props_arry_rt> optical_props;
@@ -81,6 +90,8 @@ class Radiation_solver_longwave
         std::unique_ptr<Source_func_lw_rt> sources;
 
         std::unique_ptr<Optical_props_1scl_rt> cloud_optical_props;
+
+        std::unique_ptr<Optical_props_1scl_rt> aerosol_optical_props;
         #endif
 };
 
@@ -124,7 +135,7 @@ class Radiation_solver_shortwave
                 const Array_gpu<Float,2>& lwp, const Array_gpu<Float,2>& iwp,
                 const Array_gpu<Float,2>& rel, const Array_gpu<Float,2>& dei,
                 const Array_gpu<Float,2>& rh,
-                const Aerosol_concs_gpu& aerosol_concs,
+                Aerosol_concs_gpu& aerosol_concs,
                 Array_gpu<Float,2>& tot_tau_out, Array_gpu<Float,2>& tot_ssa_out,
                 Array_gpu<Float,2>& cld_tau_out, Array_gpu<Float,2>& cld_ssa_out, Array_gpu<Float,2>& cld_asy_out,
                 Array_gpu<Float,2>& aer_tau_out, Array_gpu<Float,2>& aer_ssa_out, Array_gpu<Float,2>& aer_asy_out,
diff --git a/python/set_virtual_camera.py b/python/set_virtual_camera.py
index 10a6a13f..195f9b70 100644
--- a/python/set_virtual_camera.py
+++ b/python/set_virtual_camera.py
@@ -63,7 +63,7 @@
     cam["yaw"][:]   = 0
     cam["pitch"][:] = -90
     cam["roll"][:]  = 0
-    cam["cam_type"][:]= 1
+    cam["cam_type"][:]= 0
     cam["fov"][:] = 80
     cam["px"][:] = 0
     cam["py"][:] = 0
@@ -76,7 +76,7 @@
     cam["yaw"][:] = 0
     cam["pitch"][:] = 0
     cam["roll"][:]  = 0
-    cam["cam_type"][:]= 0
+    cam["cam_type"][:]= 1
     cam["fov"][:] = 80
     cam["px"][:] = 0.
     cam["py"][:] = 0.
diff --git a/src_cuda_rt/Gas_optics_rrtmgp_rt.cu b/src_cuda_rt/Gas_optics_rrtmgp_rt.cu
index 3cd02dea..390611e3 100644
--- a/src_cuda_rt/Gas_optics_rrtmgp_rt.cu
+++ b/src_cuda_rt/Gas_optics_rrtmgp_rt.cu
@@ -809,7 +809,7 @@ void Gas_optics_rrtmgp_rt::init_abs_coeffs(
     this->planck_frac_gpu = this->planck_frac;
 }
 
-    
+
 void Gas_optics_rrtmgp_rt::find_relevant_gases_gpt(
     const int igpt,
     std::vector<int>& gases)
@@ -867,7 +867,7 @@ void Gas_optics_rrtmgp_rt::find_relevant_gases_gpt(
 
 __global__
 void fill_gases_kernel(
-        const int col_s, const int ncol_sub, const int ncol, const int nlay, 
+        const int col_s, const int ncol_sub, const int ncol, const int nlay,
         const int dim1, const int dim2, const int ngas, const int igas,
         const Float* __restrict__ vmr_in,
         Float* __restrict__ col_gas, const Float* __restrict__ col_dry)
@@ -1013,12 +1013,12 @@ void Gas_optics_rrtmgp_rt::gas_optics(
         const Array_gpu<Float,2>& play,
         const Array_gpu<Float,2>& plev,
         const Array_gpu<Float,2>& tlay,
+        const Array_gpu<Float,2>& tlev,
         const Array_gpu<Float,1>& tsfc,
         const Gas_concs_gpu& gas_desc,
         std::unique_ptr<Optical_props_arry_rt>& optical_props,
         Source_func_lw_rt& sources,
-        const Array_gpu<Float,2>& col_dry,
-        const Array_gpu<Float,2>& tlev)
+        const Array_gpu<Float,2>& col_dry)
 {
     const int nlay = play.dim(2);
     const int ngpt = this->get_ngpt();
@@ -1026,16 +1026,16 @@ void Gas_optics_rrtmgp_rt::gas_optics(
 
     if (jtemp.size()==0)
     {
-        this->jtemp.set_dims({play.dim(1), play.dim(2)});
-        this->jpress.set_dims({play.dim(1), play.dim(2)});
-        this->tropo.set_dims({play.dim(1), play.dim(2)});
-        this->fmajor.set_dims({2, 2, 2, play.dim(1), play.dim(2)});
-        this->jeta.set_dims({2, play.dim(1), play.dim(2)});
+        this->jtemp.set_dims({ncol_sub, nlay});
+        this->jpress.set_dims({ncol_sub, nlay});
+        this->tropo.set_dims({ncol_sub, nlay});
+        this->fmajor.set_dims({2, 2, 2, ncol_sub, nlay});
+        this->jeta.set_dims({2, ncol_sub, nlay});
     }
 
     // Gas optics.
     compute_gas_taus(
-            1, ncol_sub, ncol, nlay, ngpt, nband, igpt,
+            col_s, ncol_sub, ncol, nlay, ngpt, nband, igpt,
             play, plev, tlay, gas_desc,
             optical_props,
             this->jtemp, this->jpress, this->jeta, this->tropo, this->fmajor,
@@ -1043,7 +1043,7 @@ void Gas_optics_rrtmgp_rt::gas_optics(
 
     // External sources.
     source(
-            ncol, nlay, nband, ngpt, igpt,
+            col_s, ncol_sub, ncol, nlay, nband, ngpt, igpt,
             play, plev, tlay, tsfc,
             this->jtemp, this->jpress, this->jeta, this->tropo, this->fmajor,
             sources, tlev);
@@ -1053,7 +1053,7 @@ void Gas_optics_rrtmgp_rt::gas_optics(
 // Gas optics solver shortwave variant.
 
 void Gas_optics_rrtmgp_rt::gas_optics(
-        const int igpt, 
+        const int igpt,
         const int col_s, const int ncol_sub, const int ncol,
         const Array_gpu<Float,2>& play,
         const Array_gpu<Float,2>& plev,
@@ -1144,7 +1144,7 @@ void Gas_optics_rrtmgp_rt::compute_gas_taus(
         fill_gases_kernel<<<grid_gpu, block_gpu>>>(
             col_s, ncol_sub, ncol, nlay, vmr_2d.dim(1), vmr_2d.dim(2), ngas, igas, vmr_2d.ptr(), col_gas.ptr(), col_dry.ptr());
     }
-    
+
     Gas_optics_rrtmgp_kernels_cuda_rt::interpolation(
             col_s, ncol_sub, ncol, nlay, igpt,
             ngas, nflav, neta, npres, ntemp,
@@ -1181,9 +1181,7 @@ void Gas_optics_rrtmgp_rt::compute_gas_taus(
 
     if (has_rayleigh)
     {
-        Array_gpu<Float,2> tau({ncol_sub, nlay});
         Array_gpu<Float,2> tau_rayleigh({ncol_sub, nlay});
-        Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(nlay, ncol_sub, tau.ptr());
         Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(ncol_sub, nlay, tau_rayleigh.ptr());
 
         Gas_optics_rrtmgp_kernels_cuda_rt::compute_tau_absorption(
@@ -1214,7 +1212,7 @@ void Gas_optics_rrtmgp_rt::compute_gas_taus(
                 col_mix.ptr(), fmajor.ptr(), fminor.ptr(),
                 play.ptr(), tlay.ptr(), col_gas.ptr(),
                 jeta.ptr(), jtemp.ptr(), jpress.ptr(),
-                tau.ptr());
+                optical_props->get_tau().ptr());
 
         Gas_optics_rrtmgp_kernels_cuda_rt::compute_tau_rayleigh(
                 col_s, ncol_sub, ncol, nlay, nband, ngpt, igpt,
@@ -1228,13 +1226,11 @@ void Gas_optics_rrtmgp_rt::compute_gas_taus(
 
         Gas_optics_rrtmgp_kernels_cuda_rt::combine_abs_and_rayleigh(
                 col_s, ncol_sub, ncol, nlay,
-                tau.ptr(), tau_rayleigh.ptr(),
+                tau_rayleigh.ptr(),
                 optical_props->get_tau().ptr(), optical_props->get_ssa().ptr(), optical_props->get_g().ptr());
     }
     else
     {
-        Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(ncol, nlay, optical_props->get_tau().ptr());
-
         Gas_optics_rrtmgp_kernels_cuda_rt::compute_tau_absorption(
                 col_s, ncol_sub, ncol, nlay, nband, ngpt, igpt,
                 ngas, nflav, neta, npres, ntemp,
@@ -1267,22 +1263,8 @@ void Gas_optics_rrtmgp_rt::compute_gas_taus(
     }
 }
 
-// void Gas_optics_rrtmgp_rt::combine_abs_and_rayleigh(
-//         const Array_gpu<Float,2>& tau,
-//         const Array_gpu<Float,2>& tau_rayleigh,
-//         std::unique_ptr<Optical_props_arry_rt>& optical_props)
-// {
-//     int ncol = tau.dim(1);
-//     int nlay = tau.dim(2);
-//
-//     Gas_optics_rrtmgp_kernels_cuda_rt::combine_abs_and_rayleigh(
-//             ncol, nlay,
-//             tau, tau_rayleigh,
-//             optical_props->get_tau(), optical_props->get_ssa(), optical_props->get_g());
-// }
-
 void Gas_optics_rrtmgp_rt::source(
-        const int ncol, const int nlay, const int nbnd, const int ngpt, const int igpt,
+        const int col_s, const int ncol_sub, const int ncol, const int nlay, const int nbnd, const int ngpt, const int igpt,
         const Array_gpu<Float,2>& play, const Array_gpu<Float,2>& plev,
         const Array_gpu<Float,2>& tlay, const Array_gpu<Float,1>& tsfc,
         const Array_gpu<int,2>& jtemp, const Array_gpu<int,2>& jpress,
@@ -1301,15 +1283,15 @@ void Gas_optics_rrtmgp_rt::source(
 
     int sfc_lay = play({1, 1}) > play({1, nlay}) ? 1 : nlay;
 
-    Gas_optics_rrtmgp_kernels_cuda_rt::Planck_source(
-            ncol, nlay, nbnd, ngpt, igpt,
+    Gas_optics_rrtmgp_kernels_cuda_rt::compute_planck_source(
+            col_s, ncol_sub, ncol, nlay, nbnd, ngpt, igpt,
             nflav, neta, npres, ntemp, nPlanckTemp,
             tlay.ptr(), tlev.ptr(), tsfc.ptr(), sfc_lay,
             fmajor.ptr(), jeta.ptr(), tropo.ptr(), jtemp.ptr(), jpress.ptr(),
             gpoint_bands.ptr(), band_lims_gpoint.ptr(), this->planck_frac_gpu.ptr(), this->temp_ref_min,
             this->totplnk_delta, this->totplnk_gpu.ptr(),
-            sources.get_sfc_source().ptr(), sources.get_lay_source().ptr(), sources.get_lev_source_inc().ptr(),
-            sources.get_lev_source_dec().ptr(), sources.get_sfc_source_jac().ptr());
+            sources.get_sfc_source().ptr(), sources.get_lay_source().ptr(),
+            sources.get_lev_source().ptr(), sources.get_sfc_source_jac().ptr());
 }
 
 
diff --git a/src_cuda_rt/Rte_lw_rt.cu b/src_cuda_rt/Rte_lw_rt.cu
index fbc75675..97ea6696 100644
--- a/src_cuda_rt/Rte_lw_rt.cu
+++ b/src_cuda_rt/Rte_lw_rt.cu
@@ -57,7 +57,7 @@ namespace
     }
 }
 
-//    
+//
 //    void apply_BC(
 //            int ncol, int nlay, int ngpt,
 //            Bool top_at_1, Array<Float,3>& gpt_flux_dn)
@@ -67,7 +67,7 @@ namespace
 //                &top_at_1, gpt_flux_dn.ptr());
 //    }
 //
-//    
+//
 //    void apply_BC(
 //            int ncol, int nlay, int ngpt,
 //            Bool top_at_1, const Array<Float,2>& inc_flux,
@@ -90,29 +90,32 @@ void Rte_lw_rt::rte_lw(
         const int n_gauss_angles)
 {
     const int max_gauss_pts = 4;
-    const Array<Float,2> gauss_Ds(
-            {      1.66,         0.,         0.,         0.,
-             1.18350343, 2.81649655,         0.,         0.,
-             1.09719858, 1.69338507, 4.70941630,         0.,
-             1.06056257, 1.38282560, 2.40148179, 7.15513024},
-            { max_gauss_pts, max_gauss_pts });
-
-    const Array<Float,2> gauss_wts(
-            {         0.5,           0.,           0.,           0.,
-             0.3180413817, 0.1819586183,           0.,           0.,
-             0.2009319137, 0.2292411064, 0.0698269799,           0.,
-             0.1355069134, 0.2034645680, 0.1298475476, 0.0311809710},
-            { max_gauss_pts, max_gauss_pts });
+    // Weights and angle secants for "Gauss-Jacobi-5" quadrature.
+    // Values from Table 1, R. J. Hogan 2023, doi:10.1002/qj.4598
+    const Array_gpu<Float,2> gauss_Ds(
+            Array<Float,2>(
+            { 1./0.6096748751, 0.            , 0.             , 0.,
+              1./0.2509907356, 1/0.7908473988, 0.             , 0.,
+              1./0.1024922169, 1/0.4417960320, 1./0.8633751621, 0.,
+              1./0.0454586727, 1/0.2322334416, 1./0.5740198775, 1./0.903077597 },
+            { max_gauss_pts, max_gauss_pts }));
+
+    const Array_gpu<Float,2> gauss_wts(
+            Array<Float,2>(
+            { 1.,           0.,           0.,           0.,
+              0.2300253764, 0.7699746236, 0.,           0.,
+              0.0437820218, 0.3875796738, 0.5686383044, 0.,
+              0.0092068785, 0.1285704278, 0.4323381850, 0.4298845087 },
+            { max_gauss_pts, max_gauss_pts }));
 
     const int ncol = optical_props->get_ncol();
     const int nlay = optical_props->get_nlay();
-    const int ngpt = optical_props->get_ngpt();
 
     // Upper boundary condition.
     if (inc_flux.size() == 0)
-        Rte_solver_kernels_cuda_rt::apply_BC(ncol, nlay, ngpt, top_at_1, gpt_flux_dn.ptr());
+        Rte_solver_kernels_cuda_rt::apply_BC(ncol, nlay, top_at_1, gpt_flux_dn.ptr());
     else
-        Rte_solver_kernels_cuda_rt::apply_BC(ncol, nlay, ngpt, top_at_1, inc_flux.ptr(), gpt_flux_dn.ptr());
+        Rte_solver_kernels_cuda_rt::apply_BC(ncol, nlay, top_at_1, inc_flux.ptr(), gpt_flux_dn.ptr());
 
     // Run the radiative transfer solver.
     const int n_quad_angs = n_gauss_angles;
@@ -127,11 +130,11 @@ void Rte_lw_rt::rte_lw(
     Array_gpu<Float,2> gpt_flux_up_jac(gpt_flux_up.get_dims());
 
     Rte_solver_kernels_cuda_rt::lw_solver_noscat_gaussquad(
-            ncol, nlay, ngpt, top_at_1, n_quad_angs,
+            ncol, nlay, top_at_1, n_quad_angs,
             gauss_Ds_subset.ptr(), gauss_wts_subset.ptr(),
             optical_props->get_tau().ptr(),
             sources.get_lay_source().ptr(),
-            sources.get_lev_source_inc().ptr(), sources.get_lev_source_dec().ptr(),
+            sources.get_lev_source().ptr(),
             sfc_emis.ptr(), sources.get_sfc_source().ptr(),
             gpt_flux_up.ptr(), gpt_flux_dn.ptr(),
             sfc_src_jac.ptr(), gpt_flux_up_jac.ptr());
diff --git a/src_cuda_rt/Rte_sw_rt.cu b/src_cuda_rt/Rte_sw_rt.cu
index bb9d4350..6c513407 100644
--- a/src_cuda_rt/Rte_sw_rt.cu
+++ b/src_cuda_rt/Rte_sw_rt.cu
@@ -59,7 +59,7 @@ namespace
 
 //namespace rrtmgp_kernel_launcher
 //{
-//    
+//
 //    void apply_BC(
 //            int ncol, int nlay, int ngpt,
 //            Bool top_at_1, Array<Float,3>& gpt_flux_dn)
@@ -69,7 +69,7 @@ namespace
 //                &top_at_1, gpt_flux_dn.ptr());
 //    }
 //
-//    
+//
 //    void apply_BC(
 //            int ncol, int nlay, int ngpt, Bool top_at_1,
 //            const Array<Float,2>& inc_flux, Array<Float,3>& gpt_flux_dn)
@@ -79,7 +79,7 @@ namespace
 //                const_cast<Float*>(inc_flux.ptr()), gpt_flux_dn.ptr());
 //    }
 //
-//    
+//
 //    void apply_BC(
 //            int ncol, int nlay, int ngpt, Bool top_at_1,
 //            const Array<Float,2>& inc_flux,
@@ -94,7 +94,7 @@ namespace
 //                gpt_flux.ptr());
 //    }
 //
-//    
+//
 //    void sw_solver_2stream(
 //            int ncol, int nlay, int ngpt, Bool top_at_1,
 //            const Array<Float,3>& tau,
@@ -130,22 +130,21 @@ void Rte_sw_rt::rte_sw(
 {
     const int ncol = optical_props->get_ncol();
     const int nlay = optical_props->get_nlay();
-    const int ngpt = optical_props->get_ngpt();
 
  //   expand_and_transpose(optical_props, sfc_alb_dir, sfc_alb_dir_gpt);
  //   expand_and_transpose(optical_props, sfc_alb_dif, sfc_alb_dif_gpt);
 
     // Upper boundary condition. At this stage, flux_dn contains the diffuse radiation only.
-    Rte_solver_kernels_cuda_rt::apply_BC(ncol, nlay, ngpt, top_at_1, inc_flux_dir.ptr(), mu0.ptr(), gpt_flux_dir.ptr());
+    Rte_solver_kernels_cuda_rt::apply_BC(ncol, nlay, top_at_1, inc_flux_dir.ptr(), mu0.ptr(), gpt_flux_dir.ptr());
     if (inc_flux_dif.size() == 0)
-        Rte_solver_kernels_cuda_rt::apply_BC(ncol, nlay, ngpt, top_at_1, gpt_flux_dn.ptr());
+        Rte_solver_kernels_cuda_rt::apply_BC(ncol, nlay, top_at_1, gpt_flux_dn.ptr());
     else
-        Rte_solver_kernels_cuda_rt::apply_BC(ncol, nlay, ngpt, top_at_1, inc_flux_dif.ptr(), gpt_flux_dn.ptr());
+        Rte_solver_kernels_cuda_rt::apply_BC(ncol, nlay, top_at_1, inc_flux_dif.ptr(), gpt_flux_dn.ptr());
 
     // Run the radiative transfer solver
     // CvH: only two-stream solutions, I skipped the sw_solver_noscat.
     Rte_solver_kernels_cuda_rt::sw_solver_2stream(
-            ncol, nlay, ngpt, top_at_1,
+            ncol, nlay, top_at_1,
             optical_props->get_tau().ptr(),
             optical_props->get_ssa().ptr(),
             optical_props->get_g  ().ptr(),
diff --git a/src_cuda_rt/Source_functions_rt.cu b/src_cuda_rt/Source_functions_rt.cu
index 12ccf8d8..e24143ab 100644
--- a/src_cuda_rt/Source_functions_rt.cu
+++ b/src_cuda_rt/Source_functions_rt.cu
@@ -35,8 +35,7 @@ Source_func_lw_rt::Source_func_lw_rt(
     sfc_source({n_col}),
     sfc_source_jac({n_col}),
     lay_source({n_col, n_lay}),
-    lev_source_inc({n_col, n_lay}),
-    lev_source_dec({n_col, n_lay})
+    lev_source({n_col, n_lay+1})
 {}
 
 
diff --git a/src_kernels_cuda/gas_optics_rrtmgp_kernels_launchers.cu b/src_kernels_cuda/gas_optics_rrtmgp_kernels_launchers.cu
index 30f9ddd5..8506712a 100644
--- a/src_kernels_cuda/gas_optics_rrtmgp_kernels_launchers.cu
+++ b/src_kernels_cuda/gas_optics_rrtmgp_kernels_launchers.cu
@@ -475,7 +475,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda
 
         dim3 grid_gpu;
         dim3 block_gpu;
-        
+
         if (tunings.count("Planck_source_kernel") == 0)
         {
             std::tie(grid_gpu, block_gpu) = tune_kernel(
@@ -495,7 +495,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda
                     delta_Tsurf, sfc_src, lay_src,
                     lev_src,
                     sfc_src_jac);
-            
+
             tunings["Planck_source_kernel"].first = grid_gpu;
             tunings["Planck_source_kernel"].second = block_gpu;
         }
diff --git a/src_kernels_cuda_rt/gas_optics_rrtmgp_kernels_launchers_rt.cu b/src_kernels_cuda_rt/gas_optics_rrtmgp_kernels_launchers_rt.cu
index 0fef9aa6..d8a3867e 100644
--- a/src_kernels_cuda_rt/gas_optics_rrtmgp_kernels_launchers_rt.cu
+++ b/src_kernels_cuda_rt/gas_optics_rrtmgp_kernels_launchers_rt.cu
@@ -201,7 +201,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
 
     void combine_abs_and_rayleigh(
             const int col_s, const int ncol_sub, const int ncol, const int nlay,
-            const Float* tau_abs, const Float* tau_rayleigh,
+            const Float* tau_rayleigh,
             Float* tau, Float* ssa, Float* g)
     {
         Tuner_map& tunings = Tuner::get_map();
@@ -213,17 +213,21 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
 
         if (tunings.count("combine_abs_and_rayleigh_kernel_rt") == 0)
         {
+            Float* tau_tmp = Tools_gpu::allocate_gpu<Float>(nlay*ncol);
             std::tie(grid, block) = tune_kernel(
                 "combine_abs_and_rayleigh_kernel_rt",
                 dim3(ncol_sub, nlay, 1),
                 {24, 32, 48, 64, 96, 128, 256, 512}, {1, 2, 4}, {1},
                 combine_abs_and_rayleigh_kernel,
                 col_s, ncol_sub, ncol, nlay, tmin,
-                tau_abs, tau_rayleigh,
-                tau, ssa, g);
+                tau_rayleigh,
+                tau_tmp, ssa, g);
+
+            Tools_gpu::free_gpu<Float>(tau_tmp);
 
             tunings["combine_abs_and_rayleigh_kernel_rt"].first = grid;
             tunings["combine_abs_and_rayleigh_kernel_rt"].second = block;
+
         }
         else
         {
@@ -234,13 +238,13 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
 
         combine_abs_and_rayleigh_kernel<<<grid, block>>>(
                 col_s, ncol_sub, ncol, nlay, tmin,
-                tau_abs, tau_rayleigh,
+                tau_rayleigh,
                 tau, ssa, g);
     }
 
 
     void compute_tau_rayleigh(
-            const int col_s, const int ncol_sub, const int ncol, const int nlay, const int nbnd, const int ngpt, 
+            const int col_s, const int ncol_sub, const int ncol, const int nlay, const int nbnd, const int ngpt,
             const int igpt, const int ngas, const int nflav, const int neta, const int npres, const int ntemp,
             const int* gpoint_bands,
             const int* band_lims_gpt,
@@ -296,7 +300,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
 
 
     void compute_tau_absorption(
-            const int col_s, const int ncol_sub, const int ncol, const int nlay, const int nband, const int ngpt, 
+            const int col_s, const int ncol_sub, const int ncol, const int nlay, const int nband, const int ngpt,
             const int igpt, const int ngas, const int nflav, const int neta, const int npres, const int ntemp,
             const int nminorlower, const int nminorklower,
             const int nminorupper, const int nminorkupper,
@@ -334,7 +338,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
 
         if (tunings.count("gas_optical_depths_major_kernel_rt") == 0)
         {
-            Float* tau_tmp = Tools_gpu::allocate_gpu<Float>(nlay*ncol_sub);
+            Float* tau_tmp = Tools_gpu::allocate_gpu<Float>(nlay*ncol);
 
             std::tie(grid_maj, block_maj) = tune_kernel(
                     "gas_optical_depths_major_kernel_rt",
@@ -378,7 +382,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
         dim3 grid_min_1(nlay, ncol_sub, 1), block_min_1;
         if (tunings.count("gas_optical_depths_minor_kernel_lower_rt") == 0)
         {
-            Float* tau_tmp = Tools_gpu::allocate_gpu<Float>(nlay*ncol_sub);
+            Float* tau_tmp = Tools_gpu::allocate_gpu<Float>(nlay*ncol);
 
             std::tie(grid_min_1, block_min_1) = tune_kernel(
                         "gas_optical_depths_minor_kernel_lower_rt",
@@ -412,7 +416,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
         {
             block_min_1 = tunings["gas_optical_depths_minor_kernel_lower_rt"].second;
         }
-        
+
         grid_min_1 = calc_grid_size(block_min_1, dim3(nlay, ncol_sub, 1));
 
         gas_optical_depths_minor_kernel<<<grid_min_1, block_min_1>>>(
@@ -440,7 +444,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
         dim3 grid_min_2(nlay, ncol_sub, 1), block_min_2;
         if (tunings.count("gas_optical_depths_minor_kernel_upper_rt") == 0)
         {
-            Float* tau_tmp = Tools_gpu::allocate_gpu<Float>(nlay*ncol_sub);
+            Float* tau_tmp = Tools_gpu::allocate_gpu<Float>(nlay*ncol);
             std::tie(grid_min_2, block_min_2) = tune_kernel(
                    "gas_optical_depths_minor_kernel_upper_rt",
                    dim3(nlay, ncol_sub, 1),
@@ -473,7 +477,7 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
         {
             block_min_2 = tunings["gas_optical_depths_minor_kernel_upper_rt"].second;
         }
-        
+
         grid_min_2 = calc_grid_size(block_min_2, dim3(nlay, ncol_sub, 1));
 
         gas_optical_depths_minor_kernel<<<grid_min_2, block_min_2>>>(
@@ -499,9 +503,9 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
 
 
 
-    void Planck_source(
-            const int ncol, const int nlay, const int nbnd, const int ngpt, const int igpt,
-            const int nflav, const int neta, const int npres, const int ntemp,
+    void compute_planck_source(
+            const int col_s, const int ncol_sub, const int ncol, const int nlay, const int nbnd, const int ngpt,
+            const int igpt, const int nflav, const int neta, const int npres, const int ntemp,
             const int nPlanckTemp,
             const Float* tlay,
             const Float* tlev,
@@ -519,32 +523,30 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
             const Float* totplnk,
             Float* sfc_src,
             Float* lay_src,
-            Float* lev_src_inc,
-            Float* lev_src_dec,
+            Float* lev_src,
             Float* sfc_src_jac)
     {
         Tuner_map& tunings = Tuner::get_map();
 
         const Float delta_Tsurf = Float(1.);
 
-        dim3 grid(ncol, nlay, 1), block;
+        dim3 grid(ncol_sub, nlay, 1), block;
         if (tunings.count("Planck_source_kernel_rt") == 0)
         {
             std::tie(grid, block) = tune_kernel(
                     "Planck_source_kernel_rt",
-                    dim3(ncol, nlay, 1),
+                    dim3(ncol_sub, nlay, 1),
                     {16, 32, 48, 64, 96, 128, 256, 512}, {1, 2, 4, 8}, {1},
-                    Planck_source_kernel,
-                    ncol, nlay, nbnd, ngpt,
+                    planck_source_kernel,
+                    col_s, ncol_sub, ncol, nlay, nbnd, ngpt,
                     nflav, neta, npres, ntemp, nPlanckTemp, igpt-1,
-                    tlay, tlev, tsfc, sfc_lay,
+                    tlay, tlev, tsfc, sfc_lay-1,
                     fmajor, jeta, tropo, jtemp,
                     jpress, gpoint_bands, band_lims_gpt,
                     pfracin, temp_ref_min, totplnk_delta,
                     totplnk,
                     delta_Tsurf, sfc_src, lay_src,
-                    lev_src_inc, lev_src_dec,
-                    sfc_src_jac);
+                    lev_src, sfc_src_jac);
 
             tunings["Planck_source_kernel_rt"].first = grid;
             tunings["Planck_source_kernel_rt"].second = block;
@@ -553,21 +555,21 @@ namespace Gas_optics_rrtmgp_kernels_cuda_rt
         {
             block = tunings["Planck_source_kernel_rt"].second;
         }
-        
-        grid = calc_grid_size(block, dim3(ncol, nlay, 1));
 
-        Planck_source_kernel<<<grid, block>>>(
-                ncol, nlay, nbnd, ngpt,
+        grid = calc_grid_size(block, dim3(ncol_sub, nlay, 1));
+
+        planck_source_kernel<<<grid, block>>>(
+                col_s, ncol_sub, ncol, nlay, nbnd, ngpt,
                 nflav, neta, npres, ntemp, nPlanckTemp, igpt-1,
-                tlay, tlev, tsfc, sfc_lay,
+                tlay, tlev, tsfc, sfc_lay-1,
                 fmajor, jeta, tropo, jtemp,
                 jpress, gpoint_bands, band_lims_gpt,
                 pfracin, temp_ref_min, totplnk_delta,
                 totplnk,
                 delta_Tsurf,
                 sfc_src, lay_src,
-                lev_src_inc, lev_src_dec,
-                sfc_src_jac);
+                lev_src, sfc_src_jac);
+
     }
 }
 
diff --git a/src_kernels_cuda_rt/gas_optics_rrtmgp_kernels_rt.cu b/src_kernels_cuda_rt/gas_optics_rrtmgp_kernels_rt.cu
index 7104d4eb..785c73e2 100644
--- a/src_kernels_cuda_rt/gas_optics_rrtmgp_kernels_rt.cu
+++ b/src_kernels_cuda_rt/gas_optics_rrtmgp_kernels_rt.cu
@@ -172,9 +172,9 @@ void zero_array_kernel(
 }
 
 __global__
-void Planck_source_kernel(
-        const int ncol, const int nlay, const int nband, const int ngpt,
-        const int nflav, const int neta, const int npres, const int ntemp,
+void planck_source_kernel(
+        const int col_s, const int ncol_sub, const int ncol, const int nlay, const int nband,
+        const int ngpt,const int nflav, const int neta, const int npres, const int ntemp,
         const int nPlanckTemp, const int igpt,
         const Float* __restrict__ tlay, const Float* __restrict__ tlev,
         const Float* __restrict__ tsfc,
@@ -186,21 +186,23 @@ void Planck_source_kernel(
         const Float temp_ref_min, const Float totplnk_delta,
         const Float* __restrict__ totplnk,
         const Float delta_Tsurf,
-        Float* __restrict__ sfc_src, Float* __restrict__ lay_src,
-        Float* __restrict__ lev_src_inc, Float* __restrict__ lev_src_dec,
+        Float* __restrict__ sfc_src,
+        Float* __restrict__ lay_src,
+        Float* __restrict__ lev_src,
         Float* __restrict__ sfc_src_jac)
 {
     const int icol = blockIdx.x*blockDim.x + threadIdx.x;
     const int ilay = blockIdx.y*blockDim.y + threadIdx.y;
 
-    if ( (icol < ncol) && (ilay < nlay))
+    if ( (icol < ncol_sub) && (ilay < nlay))
     {
         const int ibnd = gpoint_bands[igpt]-1;
-        const int idx_collay = icol + ilay * ncol;
+        const int idx_collay = icol + ilay * ncol_sub;
+
         const int itropo = !tropo[idx_collay];
 
-        const int idx_fcl3 = 2 * 2 * 2 * (icol + ilay*ncol);
-        const int idx_fcl1 = 2 *         (icol + ilay*ncol);
+        const int idx_fcl3 = 2 * 2 * 2 * (icol + ilay*ncol_sub);
+        const int idx_fcl1 = 2 *         (icol + ilay*ncol_sub);
 
         const int j0 = jeta[idx_fcl1+0];
         const int j1 = jeta[idx_fcl1+1];
@@ -208,17 +210,10 @@ void Planck_source_kernel(
         const int jpress_idx = jpress[idx_collay]+itropo;
 
         // compute layer source irradiances.
-        const int idx_tmp = icol + ilay*ncol;
-        const Float planck_function_lay = interpolate1D(tlay[idx_tmp], temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk[ibnd * nPlanckTemp]);
+        const int idx_tmp = icol + col_s + ilay*ncol;
+        Float planck_function_1 = interpolate1D(tlay[idx_tmp], temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk[ibnd * nPlanckTemp]);
 
-        // compute level source irradiances.
-        const int idx_tmp1 = icol + (ilay+1)*ncol;
-        const int idx_tmp2 = icol + ilay*ncol;
-        const Float planck_function_lev1 = interpolate1D(tlev[idx_tmp1], temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk[ibnd * nPlanckTemp]);
-        const Float planck_function_lev2 = interpolate1D(tlev[idx_tmp2], temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk[ibnd * nPlanckTemp]);
-
-        const int idx = icol + ilay*ncol;
-        const int idx_sfc = icol;
+        const int idx = icol + col_s + ilay*ncol;
 
         const Float pfrac_loc =
               (fmajor[idx_fcl3+0] * pfracin[(jtemp_idx-1) + (j0-1)*ntemp + (jpress_idx-1)*ntemp*neta + igpt*ntemp*neta*(npres+1)] +
@@ -232,23 +227,60 @@ void Planck_source_kernel(
                fmajor[idx_fcl3+7] * pfracin[jtemp_idx +  j1   *ntemp + jpress_idx    *ntemp*neta + igpt*ntemp*neta*(npres+1)]);
 
         // Layer source
-        lay_src[idx] = pfrac_loc * planck_function_lay;
+        lay_src[idx] = pfrac_loc * planck_function_1;
 
-        // Level source
-        lev_src_inc[idx] = pfrac_loc * planck_function_lev1;
-        lev_src_dec[idx] = pfrac_loc * planck_function_lev2;
+        // level source irradiances.
+        planck_function_1 = interpolate1D(tlev[idx_tmp], temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk[ibnd * nPlanckTemp]);
 
-        // Surface
-        if (ilay == sfc_lay - 1) // Subtract one to correct for fortran indexing.
+        if (ilay == 0)
         {
-            const Float planck_function_sfc1 = interpolate1D(
-                    tsfc[icol], temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk[ibnd * nPlanckTemp]);
-            const Float planck_function_sfc2 = interpolate1D(
-                    tsfc[icol] + delta_Tsurf, temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk[ibnd * nPlanckTemp]);
+            lev_src[idx] = pfrac_loc * planck_function_1;
+        }
+        else
+        {
+            const int idx_collay_m1 = icol + (ilay-1) * ncol_sub;
+            const int itropo_m1 = !tropo[idx_collay_m1];
+
+            const int idx_fcl3_m1 = 2 * 2 * 2 * (icol + (ilay-1)*ncol_sub);
+            const int idx_fcl1_m1 = 2 *         (icol + (ilay-1)*ncol_sub);
 
-            sfc_src[idx_sfc] = pfrac_loc * planck_function_sfc1;
-            sfc_src_jac[idx_sfc] = pfrac_loc * (planck_function_sfc2 - planck_function_sfc1);
+            const int j0_m1 = jeta[idx_fcl1_m1+0];
+            const int j1_m1 = jeta[idx_fcl1_m1+1];
+            const int jtemp_idx_m1 = jtemp[idx_collay_m1];
+            const int jpress_idx_m1 = jpress[idx_collay_m1]+itropo_m1;
+
+            const Float pfrac_m1 =
+                  (fmajor[idx_fcl3_m1+0] * pfracin[(jtemp_idx_m1-1) + (j0_m1-1)*ntemp + (jpress_idx_m1-1)*ntemp*neta + igpt*ntemp*neta*(npres+1)] +
+                   fmajor[idx_fcl3_m1+1] * pfracin[(jtemp_idx_m1-1) +  j0_m1   *ntemp + (jpress_idx_m1-1)*ntemp*neta + igpt*ntemp*neta*(npres+1)] +
+                   fmajor[idx_fcl3_m1+2] * pfracin[(jtemp_idx_m1-1) + (j0_m1-1)*ntemp + jpress_idx_m1    *ntemp*neta + igpt*ntemp*neta*(npres+1)] +
+                   fmajor[idx_fcl3_m1+3] * pfracin[(jtemp_idx_m1-1) +  j0_m1   *ntemp + jpress_idx_m1    *ntemp*neta + igpt*ntemp*neta*(npres+1)])
+
+                + (fmajor[idx_fcl3_m1+4] * pfracin[jtemp_idx_m1 + (j1_m1-1)*ntemp + (jpress_idx_m1-1)*ntemp*neta + igpt*ntemp*neta*(npres+1)] +
+                   fmajor[idx_fcl3_m1+5] * pfracin[jtemp_idx_m1 +  j1_m1   *ntemp + (jpress_idx_m1-1)*ntemp*neta + igpt*ntemp*neta*(npres+1)] +
+                   fmajor[idx_fcl3_m1+6] * pfracin[jtemp_idx_m1 + (j1_m1-1)*ntemp + jpress_idx_m1    *ntemp*neta + igpt*ntemp*neta*(npres+1)] +
+                   fmajor[idx_fcl3_m1+7] * pfracin[jtemp_idx_m1 +  j1_m1   *ntemp + jpress_idx_m1    *ntemp*neta + igpt*ntemp*neta*(npres+1)]);
+
+            lev_src[idx] = sqrt(pfrac_loc * pfrac_m1) * planck_function_1;
+        }
+
+        if (ilay ==  (nlay-1))
+        {
+            const int idx_tmp_p1 = idx_tmp + ncol;
+            planck_function_1 = interpolate1D(tlev[idx_tmp_p1], temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk[ibnd * nPlanckTemp]);
+            lev_src[idx_tmp_p1] = pfrac_loc * planck_function_1;
         }
+
+        if (ilay == sfc_lay)
+        {
+            const int idx_sfc = icol + col_s;
+                        planck_function_1 = interpolate1D(tsfc[idx_sfc],               temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk[ibnd * nPlanckTemp]);
+            const Float planck_function_2 = interpolate1D(tsfc[idx_sfc] + delta_Tsurf, temp_ref_min, totplnk_delta, nPlanckTemp, &totplnk[ibnd * nPlanckTemp]);
+
+            sfc_src[idx_sfc] = pfrac_loc * planck_function_1;
+            sfc_src_jac[idx_sfc] = pfrac_loc * (planck_function_2 - planck_function_1);
+
+        }
+
     }
 }
 
@@ -256,8 +288,8 @@ void Planck_source_kernel(
 __global__
 void interpolation_kernel(
         const int igpt,
-        const int col_s, const int ncol_sub, const int ncol, const int nlay, const int ngas, 
-        const int nflav, const int neta, const int npres, const int ntemp, 
+        const int col_s, const int ncol_sub, const int ncol, const int nlay, const int ngas,
+        const int nflav, const int neta, const int npres, const int ntemp,
         const Float tmin,
         const int* __restrict__ gpoint_flavor,
         const int* __restrict__ flavor,
@@ -293,7 +325,7 @@ void interpolation_kernel(
 
         jpress[idx] = min(npres-1, max(1, int(locpress)));
         tropo[idx] = log(play[idx_off]) > press_ref_trop_log;
-        
+
         const int itropo = !tropo[idx];
         const int iflav = gpoint_flavor[itropo + 2*igpt] - 1;
 
@@ -305,7 +337,7 @@ void interpolation_kernel(
 
         const Float gas1 = col_gas[idx + gas_idx1*nlay*ncol_sub];
         const Float gas2 = col_gas[idx + gas_idx2*nlay*ncol_sub];
-        
+
         for (int itemp=0; itemp<2; ++itemp)
         {
             const int vmr_base_idx = itropo + (jtemp[idx]+itemp-1) * (ngas+1) * 2;
@@ -356,6 +388,7 @@ void gas_optical_depths_major_kernel(
     if ( (icol < ncol_sub) && (ilay < nlay) )
     {
         const int idx_collay = icol + ilay*ncol_sub;
+        const int idx_off = icol + col_s + ilay*ncol;
         const int itropo = !tropo[idx_collay];
 
         const int ljtemp = jtemp[idx_collay];
@@ -372,7 +405,7 @@ void gas_optical_depths_major_kernel(
         #pragma unroll 1
         for (int i=0; i<2; ++i)
         {
-            tau[idx_collay] += col_mix[idx_fcl1+i] *
+            tau[idx_off] += col_mix[idx_fcl1+i] *
                 (ifmajor[i*4+0] * kmajor[(ljtemp-1+i) + (jeta[idx_fcl1+i]-1)*ntemp + (jpressi-1)*ntemp*neta + igpt*ntemp*neta*npress] +
                  ifmajor[i*4+1] * kmajor[(ljtemp-1+i) +  jeta[idx_fcl1+i]   *ntemp + (jpressi-1)*ntemp*neta + igpt*ntemp*neta*npress] +
                  ifmajor[i*4+2] * kmajor[(ljtemp-1+i) + (jeta[idx_fcl1+i]-1)*ntemp + jpressi    *ntemp*neta + igpt*ntemp*neta*npress] +
@@ -414,7 +447,7 @@ void gas_optical_depths_minor_kernel(
         const int ncollay = ncol_sub * nlay;
         const int idx_collay = icol + ilay*ncol_sub;
         const int idx_off = icol + col_s + ilay*ncol;
-        
+
         const int minor_start = first_last_minor[2*igpt];
         const int minor_end   = first_last_minor[2*igpt+1];
 
@@ -460,7 +493,7 @@ void gas_optical_depths_minor_kernel(
                                    kfminor[2] * kin[kjtemp     + (j1-1)*ntemp + (igpt-start_of_band+gpt_offset)*ntemp*neta] +
                                    kfminor[3] * kin[kjtemp     +  j1   *ntemp + (igpt-start_of_band+gpt_offset)*ntemp*neta];
 
-                tau[idx_collay] += ltau_minor * scaling;
+                tau[idx_off] += ltau_minor * scaling;
             }
         }
     }
@@ -512,7 +545,7 @@ void compute_tau_rayleigh_kernel(
 __global__
 void combine_abs_and_rayleigh_kernel(
         const int col_s, const int ncol_sub, const int ncol, const int nlay, const Float tmin,
-        const Float* __restrict__ tau_abs, const Float* __restrict__ tau_rayleigh,
+        const Float* __restrict__ tau_rayleigh,
         Float* __restrict__ tau, Float* __restrict__ ssa, Float* __restrict__ g)
 {
     // Fetch the three coordinates.
@@ -521,17 +554,17 @@ void combine_abs_and_rayleigh_kernel(
 
     if ( (icol < ncol_sub) && (ilay < nlay) )
     {
-        const int idx = icol + ilay*ncol_sub;
-        const int idx_out = icol + col_s + ilay*ncol;
+        const int idx_sub = icol + ilay*ncol_sub;
+        const int idx_full = icol + col_s + ilay*ncol;
 
-        const Float tau_tot = tau_abs[idx] + tau_rayleigh[idx];
+        const Float tau_tot = tau[idx_full] + tau_rayleigh[idx_sub];
 
-        tau[idx_out] = tau_tot;
-        g  [idx_out] = Float(0.);
+        tau[idx_full] = tau_tot;
+        g  [idx_full] = Float(0.);
 
         if (tau_tot>(Float(2.)*tmin))
-            ssa[idx_out] = tau_rayleigh[idx]/tau_tot;
+            ssa[idx_full] = tau_rayleigh[idx_sub]/tau_tot;
         else
-            ssa[idx_out] = Float(0.);
+            ssa[idx_full] = Float(0.);
     }
 }
diff --git a/src_kernels_cuda_rt/raytracer_kernels_bw.cu b/src_kernels_cuda_rt/raytracer_kernels_bw.cu
index d7736c99..4cb2eb36 100644
--- a/src_kernels_cuda_rt/raytracer_kernels_bw.cu
+++ b/src_kernels_cuda_rt/raytracer_kernels_bw.cu
@@ -280,7 +280,8 @@ namespace
             const Vector<Float>& normal,
             const Phase_kind kind)
     {
-        const Float cos_angle = dot(photon.direction, sun_direction);
+        const Float cos_angle = min(max(Float(-1.), dot(photon.direction, sun_direction)), Float(1.));
+
         if (kind == Phase_kind::HG)
         {
             return henyey_phase(g, cos_angle) * sun_solid_angle;
diff --git a/src_kernels_cuda_rt/rte_solver_kernels_launchers_rt.cu b/src_kernels_cuda_rt/rte_solver_kernels_launchers_rt.cu
index e8238947..1fbfba90 100644
--- a/src_kernels_cuda_rt/rte_solver_kernels_launchers_rt.cu
+++ b/src_kernels_cuda_rt/rte_solver_kernels_launchers_rt.cu
@@ -11,14 +11,14 @@
 namespace
 {
     #include "rte_solver_kernels_rt.cu"
-    
+
     using Tools_gpu::calc_grid_size;
 }
 
 
 namespace Rte_solver_kernels_cuda_rt
 {
-    void apply_BC(const int ncol, const int nlay, const int ngpt, const Bool top_at_1,
+    void apply_BC(const int ncol, const int nlay, const Bool top_at_1,
                   const Float* inc_flux_dir, const Float* mu0, Float* gpt_flux_dir)
     {
         const int block_col = 32;
@@ -26,56 +26,56 @@ namespace Rte_solver_kernels_cuda_rt
 
         dim3 grid_gpu(grid_col);
         dim3 block_gpu(block_col);
-        apply_BC_kernel<<<grid_gpu, block_gpu>>>(ncol, nlay, ngpt, top_at_1, inc_flux_dir, mu0, gpt_flux_dir);
+        apply_BC_kernel<<<grid_gpu, block_gpu>>>(ncol, nlay, top_at_1, inc_flux_dir, mu0, gpt_flux_dir);
     }
 
 
-    void apply_BC(const int ncol, const int nlay, const int ngpt, const Bool top_at_1, Float* gpt_flux_dn)
+    void apply_BC(const int ncol, const int nlay, const Bool top_at_1, Float* gpt_flux_dn)
     {
         const int block_col = 32;
         const int grid_col = ncol/block_col + (ncol%block_col > 0);
 
         dim3 grid_gpu(grid_col);
         dim3 block_gpu(block_col);
-        apply_BC_kernel<<<grid_gpu, block_gpu>>>(ncol, nlay, ngpt, top_at_1, gpt_flux_dn);
+        apply_BC_kernel<<<grid_gpu, block_gpu>>>(ncol, nlay, top_at_1, gpt_flux_dn);
     }
 
 
-    void apply_BC(const int ncol, const int nlay, const int ngpt, const Bool top_at_1, const Float inc_flux, Float* gpt_flux_dn)
+    void apply_BC(const int ncol, const int nlay, const Bool top_at_1, const Float inc_flux, Float* gpt_flux_dn)
     {
         const int block_col = 32;
         const int grid_col = ncol/block_col + (ncol%block_col > 0);
 
         dim3 grid_gpu(grid_col);
         dim3 block_gpu(block_col);
-        apply_BC_kernel<<<grid_gpu, block_gpu>>>(ncol, nlay, ngpt, top_at_1, inc_flux, gpt_flux_dn);
+        apply_BC_kernel<<<grid_gpu, block_gpu>>>(ncol, nlay, top_at_1, inc_flux, gpt_flux_dn);
     }
-    
-    void apply_BC(const int ncol, const int nlay, const int ngpt, const Bool top_at_1, const Float* inc_flux_dif, Float* gpt_flux_dn)
+
+    void apply_BC(const int ncol, const int nlay, const Bool top_at_1, const Float* inc_flux_dif, Float* gpt_flux_dn)
     {
         const int block_col = 32;
         const int grid_col = ncol/block_col + (ncol%block_col > 0);
 
         dim3 grid_gpu(grid_col);
         dim3 block_gpu(block_col);
-        apply_BC_kernel<<<grid_gpu, block_gpu>>>(ncol, nlay, ngpt, top_at_1, inc_flux_dif, gpt_flux_dn);
+        apply_BC_kernel<<<grid_gpu, block_gpu>>>(ncol, nlay, top_at_1, inc_flux_dif, gpt_flux_dn);
     }
 
     void lw_solver_noscat_gaussquad(
-            const int ncol, const int nlay, const int ngpt, const Bool top_at_1, const int nmus,
+            const int ncol, const int nlay, const Bool top_at_1, const int nmus,
             const Float* ds, const Float* weights, const Float* tau, const Float* lay_source,
-            const Float* lev_source_inc, const Float* lev_source_dec, const Float* sfc_emis,
+            const Float* lev_source, const Float* sfc_emis,
             const Float* sfc_src, Float* flux_up, Float* flux_dn,
             const Float* sfc_src_jac, Float* flux_up_jac)
     {
         Tuner_map& tunings = Tuner::get_map();
-        
+
         Float eps = std::numeric_limits<Float>::epsilon();
 
         const int flx_size = ncol*(nlay+1);
         const int opt_size = ncol*nlay;
         const int sfc_size = ncol;
-    
+
         Float* source_sfc = Tools_gpu::allocate_gpu<Float>(sfc_size);
         Float* source_sfc_jac = Tools_gpu::allocate_gpu<Float>(sfc_size);
         Float* sfc_albedo = Tools_gpu::allocate_gpu<Float>(sfc_size);
@@ -103,14 +103,14 @@ namespace Rte_solver_kernels_cuda_rt
             Float* flux_up_tmp = Tools_gpu::allocate_gpu<Float>((nlay+1)*ncol);
             Float* flux_dn_tmp = Tools_gpu::allocate_gpu<Float>((nlay+1)*ncol);
             Float* flux_up_jac_tmp = Tools_gpu::allocate_gpu<Float>((nlay+1)*ncol);
- 
+
             std::tie(grid_1, block_1) = tune_kernel(
                     "lw_step_1_rt",
-                    dim3(ncol, nlay, 1), 
+                    dim3(ncol, nlay, 1),
                     {8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024}, {1}, {1},
                     lw_solver_noscat_step_1_kernel,
-                    ncol, nlay, ngpt, eps, top_at_1, ds, weights, tau, lay_source,
-                    lev_source_inc, lev_source_dec, sfc_emis, sfc_src, flux_up_tmp, flux_dn_tmp, sfc_src_jac,
+                    ncol, nlay, eps, top_at_1, ds, weights, tau, lay_source,
+                    lev_source, sfc_emis, sfc_src, flux_up_tmp, flux_dn_tmp, sfc_src_jac,
                     flux_up_jac_tmp, tau_loc, trans, source_dn, source_up, source_sfc, sfc_albedo, source_sfc_jac);
 
             Tools_gpu::free_gpu<Float>(flux_up_tmp);
@@ -119,7 +119,7 @@ namespace Rte_solver_kernels_cuda_rt
 
             tunings["lw_step_1_rt"].first = grid_1;
             tunings["lw_step_1_rt"].second = block_1;
-        
+
         }
         else
         {
@@ -129,8 +129,8 @@ namespace Rte_solver_kernels_cuda_rt
         grid_1 = calc_grid_size(block_1, dim3(ncol, nlay, 1));
 
         lw_solver_noscat_step_1_kernel<<<grid_1, block_1>>>(
-                ncol, nlay, ngpt, eps, top_at_1, ds, weights, tau, lay_source,
-                lev_source_inc, lev_source_dec, sfc_emis, sfc_src, flux_up, flux_dn, sfc_src_jac,
+                ncol, nlay, eps, top_at_1, ds, weights, tau, lay_source,
+                lev_source, sfc_emis, sfc_src, flux_up, flux_dn, sfc_src_jac,
                 flux_up_jac, tau_loc, trans, source_dn, source_up, source_sfc, sfc_albedo, source_sfc_jac);
 
 
@@ -145,11 +145,11 @@ namespace Rte_solver_kernels_cuda_rt
 
             std::tie(grid_2, block_2) = tune_kernel(
                     "lw_step_2_rt",
-                    dim3(ncol, 1, 1), 
+                    dim3(ncol, 1, 1),
                     {64, 128, 256, 384, 512, 768, 1024}, {1}, {1},
                     lw_solver_noscat_step_2_kernel,
-                    ncol, nlay, ngpt, eps, top_at_1, ds, weights, tau, lay_source,
-                    lev_source_inc, lev_source_dec, sfc_emis, sfc_src, flux_up_tmp, flux_dn_tmp, sfc_src_jac,
+                    ncol, nlay, eps, top_at_1, ds, weights, tau, lay_source,
+                    lev_source, sfc_emis, sfc_src, flux_up_tmp, flux_dn_tmp, sfc_src_jac,
                     flux_up_jac_tmp, tau_loc, trans, source_dn, source_up, source_sfc, sfc_albedo, source_sfc_jac);
 
             Tools_gpu::free_gpu<Float>(flux_up_tmp);
@@ -163,12 +163,12 @@ namespace Rte_solver_kernels_cuda_rt
         {
             block_2 = tunings["lw_step_2_rt"].second;
         }
-        
+
         grid_2 = calc_grid_size(block_2, dim3(ncol, 1, 1));
 
         lw_solver_noscat_step_2_kernel<<<grid_2, block_2>>>(
-                ncol, nlay, ngpt, eps, top_at_1, ds, weights, tau, lay_source,
-                lev_source_inc, lev_source_dec, sfc_emis, sfc_src, flux_up, flux_dn, sfc_src_jac,
+                ncol, nlay, eps, top_at_1, ds, weights, tau, lay_source,
+                lev_source, sfc_emis, sfc_src, flux_up, flux_dn, sfc_src_jac,
                 flux_up_jac, tau_loc, trans, source_dn, source_up, source_sfc, sfc_albedo, source_sfc_jac);
 
 
@@ -180,20 +180,20 @@ namespace Rte_solver_kernels_cuda_rt
             Float* flux_up_tmp = Tools_gpu::allocate_gpu<Float>((nlay+1)*ncol);
             Float* flux_dn_tmp = Tools_gpu::allocate_gpu<Float>((nlay+1)*ncol);
             Float* flux_up_jac_tmp = Tools_gpu::allocate_gpu<Float>((nlay+1)*ncol);
-            
+
             std::tie(grid_3, block_3) = tune_kernel(
                     "lw_step_3_rt",
-                    dim3(ncol, nlay+1, 1), 
+                    dim3(ncol, nlay+1, 1),
                     {8, 16, 24, 32, 48, 64, 96, 128, 256}, {1, 2, 4, 8}, {1},
                     lw_solver_noscat_step_3_kernel,
-                    ncol, nlay, ngpt, eps, top_at_1, ds, weights, tau, lay_source,
-                    lev_source_inc, lev_source_dec, sfc_emis, sfc_src, flux_up_tmp, flux_dn_tmp, sfc_src_jac,
+                    ncol, nlay, eps, top_at_1, ds, weights, tau, lay_source,
+                    lev_source, sfc_emis, sfc_src, flux_up_tmp, flux_dn_tmp, sfc_src_jac,
                     flux_up_jac_tmp, tau_loc, trans, source_dn, source_up, source_sfc, sfc_albedo, source_sfc_jac);
 
             Tools_gpu::free_gpu<Float>(flux_up_tmp);
             Tools_gpu::free_gpu<Float>(flux_dn_tmp);
             Tools_gpu::free_gpu<Float>(flux_up_jac_tmp);
-            
+
             tunings["lw_step_3_rt"].first = grid_3;
             tunings["lw_step_3_rt"].second = block_3;
         }
@@ -201,15 +201,15 @@ namespace Rte_solver_kernels_cuda_rt
         {
             block_3 = tunings["lw_step_3_rt"].second;
         }
-        
+
         grid_3 = calc_grid_size(block_3, dim3(ncol, nlay+1, 1));
 
         lw_solver_noscat_step_3_kernel<<<grid_3, block_3>>>(
-                ncol, nlay, ngpt, eps, top_at_1, ds, weights, tau, lay_source,
-                lev_source_inc, lev_source_dec, sfc_emis, sfc_src, flux_up, flux_dn, sfc_src_jac,
+                ncol, nlay, eps, top_at_1, ds, weights, tau, lay_source,
+                lev_source, sfc_emis, sfc_src, flux_up, flux_dn, sfc_src_jac,
                 flux_up_jac, tau_loc, trans, source_dn, source_up, source_sfc, sfc_albedo, source_sfc_jac);
 
-        apply_BC_kernel_lw<<<grid_gpu1d, block_gpu1d>>>(top_level, ncol, nlay, ngpt, top_at_1, flux_dn, radn_dn);
+        apply_BC_kernel_lw<<<grid_gpu1d, block_gpu1d>>>(top_level, ncol, nlay, top_at_1, flux_dn, radn_dn);
 
         if (nmus > 1)
         {
@@ -218,28 +218,28 @@ namespace Rte_solver_kernels_cuda_rt
                 throw std::runtime_error("Not implemented due to lacking test case");
                 /*
                 lw_solver_noscat_step_1_kernel<<<grid_1, block_1>>>(
-                        ncol, nlay, ngpt, eps, top_at_1, ds+imu, weights+imu, tau, lay_source,
-                        lev_source_inc, lev_source_dec, sfc_emis, sfc_src, radn_up, radn_dn, sfc_src_jac,
+                        ncol, nlay, eps, top_at_1, ds+imu, weights+imu, tau, lay_source,
+                        lev_source, sfc_emis, sfc_src, radn_up, radn_dn, sfc_src_jac,
                         radn_up_jac, tau_loc, trans, source_dn, source_up, source_sfc, sfc_albedo, source_sfc_jac);
 
                 lw_solver_noscat_step_2_kernel<<<grid_2, block_2>>>(
-                        ncol, nlay, ngpt, eps, top_at_1, ds+imu, weights+imu, tau, lay_source,
-                        lev_source_inc, lev_source_dec, sfc_emis, sfc_src, radn_up, radn_dn, sfc_src_jac,
+                        ncol, nlay, eps, top_at_1, ds+imu, weights+imu, tau, lay_source,
+                        lev_source, sfc_emis, sfc_src, radn_up, radn_dn, sfc_src_jac,
                         radn_up_jac, tau_loc, trans, source_dn, source_up, source_sfc, sfc_albedo, source_sfc_jac);
 
                 lw_solver_noscat_step_3_kernel<<<grid_3, block_3>>>(
-                        ncol, nlay, ngpt, eps, top_at_1, ds+imu, weights+imu, tau, lay_source,
-                        lev_source_inc, lev_source_dec, sfc_emis, sfc_src, radn_up, radn_dn, sfc_src_jac,
+                        ncol, nlay, eps, top_at_1, ds+imu, weights+imu, tau, lay_source,
+                        lev_source, sfc_emis, sfc_src, radn_up, radn_dn, sfc_src_jac,
                         radn_up_jac, tau_loc, trans, source_dn, source_up, source_sfc, sfc_albedo, source_sfc_jac);
 
                 add_fluxes_kernel<<<grid_gpu2d, block_gpu2d>>>(
-                        ncol, nlay+1, ngpt,
+                        ncol, nlay+1,
                         radn_up, radn_dn, radn_up_jac,
                         flux_up, flux_dn, flux_up_jac);
                 */
             }
         }
-    
+
         Tools_gpu::free_gpu(source_sfc);
         Tools_gpu::free_gpu(source_sfc_jac);
         Tools_gpu::free_gpu(sfc_albedo);
@@ -253,7 +253,7 @@ namespace Rte_solver_kernels_cuda_rt
     }
 
 
-    void sw_solver_2stream(const int ncol, const int nlay, const int ngpt, const Bool top_at_1,
+    void sw_solver_2stream(const int ncol, const int nlay, const Bool top_at_1,
                            const Float* tau, const Float* ssa, const Float* g,
                            const Float* mu0, const Float* sfc_alb_dir, const Float* sfc_alb_dif,
                            Float* flux_up, Float* flux_dn, Float* flux_dir)
@@ -283,20 +283,20 @@ namespace Rte_solver_kernels_cuda_rt
             {
                 std::tie(grid_source, block_source) = tune_kernel(
                         "sw_source_2stream_kernel_rt",
-                        dim3(ncol, 1), 
+                        dim3(ncol, 1),
                         {32, 64, 96, 128, 256, 384, 512, 768, 1024}, {1}, {1},
                         sw_source_2stream_kernel<1>,
-                        ncol, nlay, ngpt, tau, ssa, g, mu0, r_dif, t_dif,
+                        ncol, nlay, tau, ssa, g, mu0, r_dif, t_dif,
                         sfc_alb_dir, source_up, source_dn, source_sfc, flux_dir);
             }
             else
             {
                 std::tie(grid_source, block_source) = tune_kernel(
                         "sw_source_2stream_kernel_rt",
-                        dim3(ncol, 1), 
+                        dim3(ncol, 1),
                         {32, 64, 96, 128, 256, 384, 512, 768, 1024}, {1}, {1},
                         sw_source_2stream_kernel<0>,
-                        ncol, nlay, ngpt, tau, ssa, g, mu0, r_dif, t_dif,
+                        ncol, nlay, tau, ssa, g, mu0, r_dif, t_dif,
                         sfc_alb_dir, source_up, source_dn, source_sfc, flux_dir);
             }
 
@@ -307,19 +307,19 @@ namespace Rte_solver_kernels_cuda_rt
         {
             block_source = tunings["sw_source_2stream_kernel_rt"].second;
         }
-            
+
         grid_source = calc_grid_size(block_source, dim3(ncol, 1));
-        
+
         if (top_at_1)
         {
             sw_source_2stream_kernel<1><<<grid_source, block_source>>>(
-                    ncol, nlay, ngpt, tau, ssa, g, mu0, r_dif, t_dif,
+                    ncol, nlay, tau, ssa, g, mu0, r_dif, t_dif,
                     sfc_alb_dir, source_up, source_dn, source_sfc, flux_dir);
         }
         else
         {
             sw_source_2stream_kernel<0><<<grid_source, block_source>>>(
-                    ncol, nlay, ngpt, tau, ssa, g, mu0, r_dif, t_dif,
+                    ncol, nlay, tau, ssa, g, mu0, r_dif, t_dif,
                     sfc_alb_dir, source_up, source_dn, source_sfc, flux_dir);
         }
 
@@ -340,7 +340,7 @@ namespace Rte_solver_kernels_cuda_rt
                         dim3(ncol, 1),
                         {16, 32, 64, 96, 128, 256, 384, 512}, {1}, {1},
                         sw_adding_kernel<1>,
-                        ncol, nlay, ngpt, top_at_1,
+                        ncol, nlay, top_at_1,
                         sfc_alb_dif, r_dif, t_dif,
                         source_dn, source_up, source_sfc,
                         flux_up_tmp, flux_dn_tmp, flux_dir_tmp, albedo, src, denom);
@@ -349,15 +349,15 @@ namespace Rte_solver_kernels_cuda_rt
             {
                 std::tie(grid_adding, block_adding) = tune_kernel(
                         "sw_adding_rt",
-                        dim3(ncol, 1), 
+                        dim3(ncol, 1),
                         {16, 32, 64, 96, 128, 256, 384, 512}, {1}, {1},
                         sw_adding_kernel<0>,
-                        ncol, nlay, ngpt, top_at_1,
+                        ncol, nlay, top_at_1,
                         sfc_alb_dif, r_dif, t_dif,
                         source_dn, source_up, source_sfc,
                         flux_up_tmp, flux_dn_tmp, flux_dir_tmp, albedo, src, denom);
             }
-            
+
             Tools_gpu::free_gpu<Float>(flux_up_tmp);
             Tools_gpu::free_gpu<Float>(flux_dn_tmp);
             Tools_gpu::free_gpu<Float>(flux_dir_tmp);
@@ -368,13 +368,13 @@ namespace Rte_solver_kernels_cuda_rt
         {
             block_adding = tunings["sw_adding_rt"].second;
         }
-        
+
         grid_adding = calc_grid_size(block_adding, dim3(ncol, 1));
 
         if (top_at_1)
         {
             sw_adding_kernel<1><<<grid_adding, block_adding>>>(
-                ncol, nlay, ngpt, top_at_1,
+                ncol, nlay, top_at_1,
                 sfc_alb_dif, r_dif, t_dif,
                 source_dn, source_up, source_sfc,
                 flux_up, flux_dn, flux_dir, albedo, src, denom);
@@ -382,7 +382,7 @@ namespace Rte_solver_kernels_cuda_rt
         else
         {
             sw_adding_kernel<0><<<grid_adding, block_adding>>>(
-                        ncol, nlay, ngpt, top_at_1,
+                        ncol, nlay, top_at_1,
                         sfc_alb_dif, r_dif, t_dif,
                         source_dn, source_up, source_sfc,
                         flux_up, flux_dn, flux_dir, albedo, src, denom);
diff --git a/src_kernels_cuda_rt/rte_solver_kernels_rt.cu b/src_kernels_cuda_rt/rte_solver_kernels_rt.cu
index 9ee6a66c..c7b94503 100644
--- a/src_kernels_cuda_rt/rte_solver_kernels_rt.cu
+++ b/src_kernels_cuda_rt/rte_solver_kernels_rt.cu
@@ -11,7 +11,7 @@ template<> __device__ constexpr float k_min() { return 1.e-4f; }
 
 __device__
 void lw_transport_noscat_kernel(
-        const int icol, const int ncol, const int nlay, const int ngpt, const Bool top_at_1,
+        const int icol, const int ncol, const int nlay, const Bool top_at_1,
         const Float* __restrict__ tau, const Float* __restrict__ trans, const Float* __restrict__ sfc_albedo,
         const Float* __restrict__ source_dn, const Float* __restrict__ source_up, const Float* __restrict__ source_sfc,
         Float* __restrict__ radn_up, Float* __restrict__ radn_dn, const Float* __restrict__ source_sfc_jac, Float* __restrict__ radn_up_jac)
@@ -73,9 +73,9 @@ void lw_transport_noscat_kernel(
 
 __global__
 void lw_solver_noscat_step_1_kernel(
-        const int ncol, const int nlay, const int ngpt, const Float eps, const Bool top_at_1,
+        const int ncol, const int nlay, const Float eps, const Bool top_at_1,
         const Float* __restrict__ D, const Float* __restrict__ weight, const Float* __restrict__ tau, const Float* __restrict__ lay_source,
-        const Float* __restrict__ lev_source_inc, const Float* __restrict__ lev_source_dec, const Float* __restrict__ sfc_emis,
+        const Float* __restrict__ lev_source, const Float* __restrict__ sfc_emis,
         const Float* __restrict__ sfc_src, Float* __restrict__ radn_up, Float* __restrict__ radn_dn,
         const Float* __restrict__ sfc_src_jac, Float* __restrict__ radn_up_jac, Float* __restrict__ tau_loc,
         Float* __restrict__ trans, Float* __restrict__ source_dn, Float* __restrict__ source_up,
@@ -87,13 +87,15 @@ void lw_solver_noscat_step_1_kernel(
     if ( (icol < ncol) && (ilay < nlay) )
     {
         const int idx = icol + ilay*ncol;
+        const int idx_p = icol + (ilay+1)*ncol;
+
         tau_loc[idx] = tau[idx] * D[0];
         trans[idx] = exp(-tau_loc[idx]);
 
         const Float fact = (tau_loc[idx]>0. && (tau_loc[idx]*tau_loc[idx])>eps) ? (Float(1.) - trans[idx]) / tau_loc[idx] - trans[idx] : tau_loc[idx] * (Float(.5) - Float(1.)/Float(3.)*tau_loc[idx]);
 
-        Float src_inc = (Float(1.) - trans[idx]) * lev_source_inc[idx] + Float(2.) * fact * (lay_source[idx]-lev_source_inc[idx]);
-        Float src_dec = (Float(1.) - trans[idx]) * lev_source_dec[idx] + Float(2.) * fact * (lay_source[idx]-lev_source_dec[idx]);
+        Float src_inc = (Float(1.) - trans[idx]) * lev_source[idx_p] + Float(2.) * fact * (lay_source[idx]-lev_source[idx_p]);
+        Float src_dec = (Float(1.) - trans[idx]) * lev_source[idx] + Float(2.) * fact * (lay_source[idx]-lev_source[idx]);
 
         source_dn[idx] = top_at_1 ? src_inc : src_dec;
         source_up[idx] = top_at_1 ? src_dec : src_inc;
@@ -103,9 +105,9 @@ void lw_solver_noscat_step_1_kernel(
 
 __global__
 void lw_solver_noscat_step_2_kernel(
-        const int ncol, const int nlay, const int ngpt, const Float eps, const Bool top_at_1,
+        const int ncol, const int nlay, const Float eps, const Bool top_at_1,
         const Float* __restrict__ D, const Float* __restrict__ weight, const Float* __restrict__ tau, const Float* __restrict__ lay_source,
-        const Float* __restrict__ lev_source_inc, const Float* __restrict__ lev_source_dec, const Float* __restrict__ sfc_emis,
+        const Float* __restrict__ lev_source, const Float* __restrict__ sfc_emis,
         const Float* __restrict__ sfc_src, Float* __restrict__ radn_up, Float* __restrict__ radn_dn,
         const Float* __restrict__ sfc_src_jac, Float* __restrict__ radn_up_jac, Float* __restrict__ tau_loc,
         Float* __restrict__ trans, Float* __restrict__ source_dn, Float* __restrict__ source_up,
@@ -126,7 +128,7 @@ void lw_solver_noscat_step_2_kernel(
 
 
         lw_transport_noscat_kernel(
-                icol, ncol, nlay, ngpt, top_at_1, tau, trans, sfc_albedo, source_dn,
+                icol, ncol, nlay, top_at_1, tau, trans, sfc_albedo, source_dn,
                 source_up, source_sfc, radn_up, radn_dn, source_sfc_jac, radn_up_jac);
     }
 }
@@ -134,9 +136,9 @@ void lw_solver_noscat_step_2_kernel(
 
 __global__
 void lw_solver_noscat_step_3_kernel(
-        const int ncol, const int nlay, const int ngpt, const Float eps, const Bool top_at_1,
+        const int ncol, const int nlay, const Float eps, const Bool top_at_1,
         const Float* __restrict__ D, const Float* __restrict__ weight, const Float* __restrict__ tau, const Float* __restrict__ lay_source,
-        const Float* __restrict__ lev_source_inc, const Float* __restrict__ lev_source_dec, const Float* __restrict__ sfc_emis,
+        const Float* __restrict__ lev_source, const Float* __restrict__ sfc_emis,
         const Float* __restrict__ sfc_src, Float* __restrict__ radn_up, Float* __restrict__ radn_dn,
         const Float* __restrict__ sfc_src_jac, Float* __restrict__ radn_up_jac, Float* __restrict__ tau_loc,
         Float* __restrict__ trans, Float* __restrict__ source_dn, Float* __restrict__ source_up,
@@ -150,15 +152,15 @@ void lw_solver_noscat_step_3_kernel(
         const Float pi = acos(Float(-1.));
 
         const int idx = icol + ilev*ncol;
-        radn_up[idx] *= Float(2.) * pi * weight[0];
-        radn_dn[idx] *= Float(2.) * pi * weight[0];
-        radn_up_jac[idx] *= Float(2.) * pi * weight[0];
+        radn_up[idx] *= pi * weight[0];
+        radn_dn[idx] *= pi * weight[0];
+        radn_up_jac[idx] *= pi * weight[0];
     }
 }
 
 template<Bool top_at_1> __global__
 void sw_adding_kernel(
-        const int ncol, const int nlay, const int ngpt, const Bool _top_at_1,
+        const int ncol, const int nlay, const Bool _top_at_1,
         const Float* __restrict__ sfc_alb_dif, const Float* __restrict__ r_dif, const Float* __restrict__ t_dif,
         const Float* __restrict__ source_dn, const Float* __restrict__ source_up, const Float* __restrict__ source_sfc,
         Float* __restrict__ flux_up, Float* __restrict__ flux_dn, const Float* __restrict__ flux_dir,
@@ -248,7 +250,7 @@ void sw_adding_kernel(
 
 template<Bool top_at_1> __global__
 void sw_source_kernel(
-        const int ncol, const int nlay, const int ngpt, const Bool _top_at_1,
+        const int ncol, const int nlay, const Bool _top_at_1,
         Float* __restrict__ r_dir, Float* __restrict__ t_dir, Float* __restrict__ t_noscat,
         const Float* __restrict__ sfc_alb_dir, Float* __restrict__ source_up, Float* __restrict__ source_dn,
         Float* __restrict__ source_sfc, Float* __restrict__ flux_dir)
@@ -293,7 +295,7 @@ void sw_source_kernel(
 }
 
 __global__
-void apply_BC_kernel_lw(const int isfc, int ncol, const int nlay, const int ngpt, const Bool top_at_1, const Float* __restrict__ inc_flux, Float* __restrict__ flux_dn)
+void apply_BC_kernel_lw(const int isfc, int ncol, const int nlay, const Bool top_at_1, const Float* __restrict__ inc_flux, Float* __restrict__ flux_dn)
 {
     const int icol = blockIdx.x*blockDim.x + threadIdx.x;
 
@@ -306,34 +308,34 @@ void apply_BC_kernel_lw(const int isfc, int ncol, const int nlay, const int ngpt
 }
 
 __global__
-void apply_BC_kernel(const int ncol, const int nlay, const int ngpt, const Bool top_at_1, const Float inc_flux, Float* __restrict__ flux_dn)
+void apply_BC_kernel(const int ncol, const int nlay, const Bool top_at_1, const Float inc_flux, Float* __restrict__ flux_dn)
 {
     const int icol = blockIdx.x*blockDim.x + threadIdx.x;
     if ( (icol < ncol) )
     {
-        const int idx_out = icol + ((top_at_1 ? 0 : (nlay * ncol))); 
+        const int idx_out = icol + ((top_at_1 ? 0 : (nlay * ncol)));
         const int idx_in = icol;
         flux_dn[idx_out] = inc_flux;
     }
 }
 
 __global__
-void apply_BC_kernel(const int ncol, const int nlay, const int ngpt, const Bool top_at_1, const Float* __restrict__ inc_flux, Float* __restrict__ flux_dn)
+void apply_BC_kernel(const int ncol, const int nlay, const Bool top_at_1, const Float* __restrict__ inc_flux, Float* __restrict__ flux_dn)
 {
     const int icol = blockIdx.x*blockDim.x + threadIdx.x;
     if ( (icol < ncol) )
     {
-        const int idx_out = icol + ((top_at_1 ? 0 : (nlay * ncol))); 
+        const int idx_out = icol + ((top_at_1 ? 0 : (nlay * ncol)));
         const int idx_in = icol;
         flux_dn[idx_out] = inc_flux[idx_in];
     }
 }
 
 __global__
-void apply_BC_kernel(const int ncol, const int nlay, const int ngpt, const Bool top_at_1, const Float* __restrict__ inc_flux, const Float* __restrict__ factor, Float* __restrict__ flux_dn)
+void apply_BC_kernel(const int ncol, const int nlay, const Bool top_at_1, const Float* __restrict__ inc_flux, const Float* __restrict__ factor, Float* __restrict__ flux_dn)
 {
     const int icol = blockIdx.x*blockDim.x + threadIdx.x;
-    if ( (icol < ncol) ) 
+    if ( (icol < ncol) )
     {
         const int idx_out = icol + ((top_at_1 ? 0 : (nlay * ncol)));
         flux_dn[idx_out] = inc_flux[icol] * factor[icol];
@@ -341,7 +343,7 @@ void apply_BC_kernel(const int ncol, const int nlay, const int ngpt, const Bool
 }
 
 __global__
-void apply_BC_kernel(const int ncol, const int nlay, const int ngpt, const Bool top_at_1, Float* __restrict__ flux_dn)
+void apply_BC_kernel(const int ncol, const int nlay, const Bool top_at_1, Float* __restrict__ flux_dn)
 {
     const int icol = blockIdx.x*blockDim.x + threadIdx.x;
     if ( (icol < ncol) )
@@ -353,7 +355,7 @@ void apply_BC_kernel(const int ncol, const int nlay, const int ngpt, const Bool
 
 __global__
 void sw_2stream_kernel(
-        const int ncol, const int nlay, const int ngpt, const Float tmin,
+        const int ncol, const int nlay, const Float tmin,
         const Float* __restrict__ tau, const Float* __restrict__ ssa,
         const Float* __restrict__ g, const Float* __restrict__ mu0,
         Float* __restrict__ r_dif, Float* __restrict__ t_dif,
@@ -404,7 +406,7 @@ void sw_2stream_kernel(
 
 /*
 __global__
-void sw_source_adding_kernel(const int ncol, const int nlay, const int ngpt, const Bool top_at_1,
+void sw_source_adding_kernel(const int ncol, const int nlay, const Bool top_at_1,
                              const Float* __restrict__ sfc_alb_dir, const Float* __restrict__ sfc_alb_dif,
                              Float* __restrict__ r_dif, Float* __restrict__ t_dif,
                              Float* __restrict__ r_dir, Float* __restrict__ t_dir, Float* __restrict__ t_noscat,
@@ -427,10 +429,10 @@ void sw_source_adding_kernel(const int ncol, const int nlay, const int ngpt, con
 }
 */
 // __global__
-// void lw_solver_noscat_gaussquad_kernel(const int ncol, const int nlay, const int ngpt, const Float eps,
+// void lw_solver_noscat_gaussquad_kernel(const int ncol, const int nlay, const Float eps,
 //                                        const Bool top_at_1, const int nmus, const Float* __restrict__ ds, const Float* __restrict__ weights,
 //                                        const Float* __restrict__ tau, const Float* __restrict__ lay_source,
-//                                        const Float* __restrict__ lev_source_inc, const Float* __restrict__ lev_source_dec, const Float* __restrict__ sfc_emis,
+//                                        const Float* __restrict__ lev_source, const Float* __restrict__ sfc_emis,
 //                                        const Float* __restrict__ sfc_src, Float* __restrict__ radn_up, Float* __restrict__ radn_dn,
 //                                        const Float* __restrict__ sfc_src_jac, Float* __restrict__ radn_up_jac, Float* __restrict__ tau_loc,
 //                                        Float* __restrict__ trans, Float* __restrict__ source_dn, Float* __restrict__ source_up,
@@ -439,23 +441,23 @@ void sw_source_adding_kernel(const int ncol, const int nlay, const int ngpt, con
 // {
 //     const int icol = blockIdx.x*blockDim.x + threadIdx.x;
 //     const int igpt = blockIdx.y*blockDim.y + threadIdx.y;
-// 
+//
 //     if ( (icol < ncol) && (igpt < ngpt) )
 //     {
 //         lw_solver_noscat_kernel(icol, igpt, ncol, nlay, ngpt, eps, top_at_1, ds[0], weights[0], tau, lay_source,
-//                          lev_source_inc, lev_source_dec, sfc_emis, sfc_src, flux_up, flux_dn, sfc_src_jac,
+//                          lev_source, sfc_emis, sfc_src, flux_up, flux_dn, sfc_src_jac,
 //                          flux_up_jac, tau_loc, trans, source_dn, source_up, source_sfc, sfc_albedo, source_sfc_jac);
 //         const int top_level = top_at_1 ? 0 : nlay;
 //         apply_BC_kernel_lw(icol, igpt, top_level, ncol, nlay, ngpt, top_at_1, flux_dn, radn_dn);
-// 
+//
 //         if (nmus > 1)
 //         {
 //             for (int imu=1; imu<nmus; ++imu)
 //             {
 //                 lw_solver_noscat_kernel(icol, igpt, ncol, nlay, ngpt, eps, top_at_1, ds[imu], weights[imu], tau, lay_source,
-//                                  lev_source_inc, lev_source_dec, sfc_emis, sfc_src, radn_up, radn_dn, sfc_src_jac,
+//                                  lev_source, sfc_emis, sfc_src, radn_up, radn_dn, sfc_src_jac,
 //                                  radn_up_jac, tau_loc, trans, source_dn, source_up, source_sfc, sfc_albedo, source_sfc_jac);
-// 
+//
 //                 for (int ilev=0; ilev<(nlay+1); ++ilev)
 //                 {
 //                     const int idx = icol + ilev*ncol + igpt*ncol*(nlay+1);
@@ -471,7 +473,7 @@ void sw_source_adding_kernel(const int ncol, const int nlay, const int ngpt, con
 
 __global__
 void add_fluxes_kernel(
-        const int ncol, const int nlev, const int ngpt,
+        const int ncol, const int nlev,
         const Float* __restrict__ radn_up, const Float* __restrict__ radn_dn, const Float* __restrict__ radn_up_jac,
         Float* __restrict__ flux_up, Float* __restrict__ flux_dn, Float* __restrict__ flux_up_jac)
 {
@@ -500,7 +502,7 @@ template<> __forceinline__ __device__ constexpr float tkmin() { return float(100
 __device__
 void sw_2stream_function(
         const int icol, const int ilay,
-        const int ncol, const int nlay, const int ngpt,
+        const int ncol, const int nlay,
         const Float* __restrict__ tau, const Float* __restrict__ ssa,
         const Float* __restrict__ g, const Float* __restrict__ mu0,
         Float* __restrict__ r_dif, Float* __restrict__ t_dif,
@@ -528,10 +530,10 @@ void sw_2stream_function(
         *t_noscat = exp(-tau[idx] * mu0_inv);
 
         const Float k_mu = k * mu0[icol];
-        
+
         const Float k_gamma3 = k * gamma3;
         const Float k_gamma4 = k * gamma4;
-        
+
         const Float fact = (abs(Float(1.) - k_mu*k_mu) > tmin<Float>()) ? Float(1.) - k_mu*k_mu : tmin<Float>();
         const Float rt_term2 = ssa[idx] * rt_term / fact;
 
@@ -552,7 +554,7 @@ void sw_2stream_function(
 
 template<Bool top_at_1> __global__
 void sw_source_2stream_kernel(
-        const int ncol, const int nlay, const int ngpt,
+        const int ncol, const int nlay,
         const Float* __restrict__ tau, const Float* __restrict__ ssa,
         const Float* __restrict__ g, const Float* __restrict__ mu0,
         Float* __restrict__ r_dif, Float* __restrict__ t_dif,
@@ -560,7 +562,7 @@ void sw_source_2stream_kernel(
         Float* __restrict__ source_sfc, Float* __restrict__ flux_dir)
 {
     const int icol = blockIdx.x*blockDim.x + threadIdx.x;
-    
+
     if ( (icol < ncol) )
     {
         if (top_at_1)
@@ -570,7 +572,7 @@ void sw_source_2stream_kernel(
 
                 Float r_dir, t_dir, t_noscat;
                 sw_2stream_function(icol, ilay,
-                        ncol, nlay, ngpt,
+                        ncol, nlay,
                         tau, ssa, g, mu0,
                         r_dif, t_dif, &r_dir, &t_dir, &t_noscat);
 
@@ -591,7 +593,7 @@ void sw_source_2stream_kernel(
             {
                 Float r_dir, t_dir, t_noscat;
                 sw_2stream_function(icol, ilay,
-                        ncol, nlay, ngpt,
+                        ncol, nlay,
                         tau, ssa, g, mu0,
                         r_dif, t_dif, &r_dir, &t_dir, &t_noscat);
 
diff --git a/src_test/Radiation_solver_rt.cu b/src_test/Radiation_solver_rt.cu
index 6852d23a..9942ea65 100644
--- a/src_test/Radiation_solver_rt.cu
+++ b/src_test/Radiation_solver_rt.cu
@@ -52,7 +52,7 @@ namespace
                 const int idx = icol + ilay*ncol;
                 tau[idx] = tau[idx] * scale_factor;
                 }
-        }    
+        }
 
         void scale_tau(Float* tau, const int ncol, const int nlay, Float scale_factor) {
                 const int block_col = 64;
@@ -405,30 +405,40 @@ Radiation_solver_longwave::Radiation_solver_longwave(
 
     this->cloud_optics_gpu = std::make_unique<Cloud_optics_rt>(
             load_and_init_cloud_optics(file_name_cloud));
+
+    //this->aerosol_optics_gpu = std::make_unique<Cloud_optics_rt>(
+    //        load_and_init_cloud_optics(file_name_cloud));
 }
 
 
 void Radiation_solver_longwave::solve_gpu(
         const bool switch_fluxes,
+        const bool switch_raytracing,
         const bool switch_cloud_optics,
+        const bool switch_aerosol_optics,
         const bool switch_single_gpt,
         const int single_gpt,
+        const Int ray_count,
+        const Vector<int> grid_cells,
+        const Vector<Float> grid_d,
+        const Vector<int> kn_grid,
         const Gas_concs_gpu& gas_concs,
+        Aerosol_concs_gpu& aerosol_concs,
         const Array_gpu<Float,2>& p_lay, const Array_gpu<Float,2>& p_lev,
         const Array_gpu<Float,2>& t_lay, const Array_gpu<Float,2>& t_lev,
         Array_gpu<Float,2>& col_dry,
         const Array_gpu<Float,1>& t_sfc, const Array_gpu<Float,2>& emis_sfc,
         const Array_gpu<Float,2>& lwp, const Array_gpu<Float,2>& iwp,
         const Array_gpu<Float,2>& rel, const Array_gpu<Float,2>& dei,
-        Array_gpu<Float,2>& tau, Array_gpu<Float,2>& lay_source,
-        Array_gpu<Float,2>& lev_source_inc, Array_gpu<Float,2>& lev_source_dec, Array_gpu<Float,1>& sfc_source,
+        const Array_gpu<Float,2>& rh,
+        Array_gpu<Float,2>& tot_tau_out, Array_gpu<Float,2>& cld_tau_out, Array_gpu<Float,2>& lay_source,
+        Array_gpu<Float,2>& lev_source, Array_gpu<Float,1>& sfc_source,
         Array_gpu<Float,2>& lw_flux_up, Array_gpu<Float,2>& lw_flux_dn, Array_gpu<Float,2>& lw_flux_net,
         Array_gpu<Float,2>& lw_gpt_flux_up, Array_gpu<Float,2>& lw_gpt_flux_dn, Array_gpu<Float,2>& lw_gpt_flux_net)
 {
 
-    throw std::runtime_error("Longwave raytracing is not implemented");
+    // throw std::runtime_error("Longwave raytracing is not implemented");
 
-    /*
     const int n_col = p_lay.dim(1);
     const int n_lay = p_lay.dim(2);
     const int n_lev = p_lev.dim(2);
@@ -438,11 +448,10 @@ void Radiation_solver_longwave::solve_gpu(
     const Bool top_at_1 = p_lay({1, 1}) < p_lay({1, n_lay});
 
     optical_props = std::make_unique<Optical_props_1scl_rt>(n_col, n_lay, *kdist_gpu);
+    cloud_optical_props = std::make_unique<Optical_props_1scl_rt>(n_col, n_lay, *cloud_optics_gpu);
+    //aerosol_optical_props = std::make_unique<Optical_props_1scl_rt>(n_col, n_lay, *aerosol_optics_gpu);
     sources = std::make_unique<Source_func_lw_rt>(n_col, n_lay, *kdist_gpu);
 
-    if (switch_cloud_optics)
-        cloud_optical_props = std::make_unique<Optical_props_1scl_rt>(n_col, n_lay, *cloud_optics_gpu);
-
     if (col_dry.size() == 0)
     {
         col_dry.set_dims({n_col, n_lay});
@@ -454,9 +463,15 @@ void Radiation_solver_longwave::solve_gpu(
         Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(n_lev, n_col, lw_flux_up.ptr());
         Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(n_lev, n_col, lw_flux_dn.ptr());
         Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(n_lev, n_col, lw_flux_net.ptr());
+
+        if (switch_raytracing)  { }
+
+
     }
 
     const Array<int, 2>& band_limits_gpt(this->kdist_gpu->get_band_lims_gpoint());
+    int previous_band = 0;
+
     for (int igpt=1; igpt<=n_gpt; ++igpt)
     {
         int band = 0;
@@ -469,51 +484,117 @@ void Radiation_solver_longwave::solve_gpu(
             }
         }
 
-        //kdist_gpu->gas_optics(
-        //        igpt-1,
-        //        p_lay,
-        //        p_lev,
-        //        t_lay,
-        //        t_sfc,
-        //        gas_concs,
-        //        optical_props,
-        //        *sources,
-        //        col_dry,
-        //        t_lev);
+        Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(n_col, n_lay, optical_props->get_tau().ptr());
+
+        // We loop over the gas optics, due to memory constraints
+        constexpr int n_col_block = 1<<14;
+
+        auto gas_optics_subset = [&](
+                const int col_s, const int n_col_subset)
+        {
+            // Run the gas_optics on a subset.
+            kdist_gpu->gas_optics(
+                    igpt,
+                    col_s,
+                    n_col_subset,
+                    n_col,
+                    p_lay,
+                    p_lev,
+                    t_lay,
+                    t_lev,
+                    t_sfc,
+                    gas_concs,
+                    optical_props,
+                    *sources,
+                    col_dry);
+        };
+
+        const int n_blocks = n_col / n_col_block;
+        const int n_col_residual = n_col % n_col_block;
+
+        if (n_blocks > 0)
+        {
+            for (int n=0; n<n_blocks; ++n)
+            {
+                const int col_s = n*n_col_block;
+                gas_optics_subset(col_s, n_col_block);
+            }
+        }
+
+        if (n_col_residual > 0)
+        {
+            const int col_s = n_blocks*n_col_block;
+            gas_optics_subset(col_s, n_col_residual);
+        }
 
         if (switch_cloud_optics)
         {
-            cloud_optics_gpu->cloud_optics(
-                    band,
-                    lwp,
-                    iwp,
-                    rel,
-                    dei,
-                    *cloud_optical_props);
-            // cloud->delta_scale();
+            if (band > previous_band)
+            {
+                cloud_optics_gpu->cloud_optics(
+                        band,
+                        lwp,
+                        iwp,
+                        rel,
+                        dei,
+                        *cloud_optical_props);
+                // cloud->delta_scale();
 
+            }
             // Add the cloud optical props to the gas optical properties.
             add_to(
                     dynamic_cast<Optical_props_1scl_rt&>(*optical_props),
                     dynamic_cast<Optical_props_1scl_rt&>(*cloud_optical_props));
         }
+        else
+        {
+            Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(n_col, n_lay, cloud_optical_props->get_tau().ptr());
+            //Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(n_col, n_lay, cloud_optical_props->get_ssa().ptr());
+            //Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(n_col, n_lay, cloud_optical_props->get_g().ptr());
+        }
+
+        // NO AEROSOL IMPLEMENTATION FOR LONGWAVE RADIATION (YET)!
+        /*if (switch_aerosol_optics)
+        {
+            if (band > previous_band)
+            {
+                aerosol_optics_gpu->aerosol_optics(
+                        band,
+                        aerosol_concs,
+                        rh, p_lev,
+                        *aerosol_optical_props);
+                // aerosol->delta_scale();
+
+            }
+            // Add the cloud optical props to the gas optical properties.
+            add_to(
+                    dynamic_cast<Optical_props_1scl_rt&>(*optical_props),
+                    dynamic_cast<Optical_props_1scl_rt&>(*aerosol_optical_props));
+        }
+        else
+        {
+            Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(n_col, n_lay, cloud_aerosol_props->get_tau().ptr());
+            Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(n_col, n_lay, cloud_aerosol_props->get_ssa().ptr());
+            Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(n_col, n_lay, cloud_aerosol_props->get_g().ptr());
+        }
+        */
+
 
         // Store the optical properties, if desired.
         if (switch_single_gpt && igpt == single_gpt)
         {
             lay_source = (*sources).get_lay_source();
-            lev_source_inc = (*sources).get_lev_source_inc();
-            lev_source_dec = (*sources).get_lev_source_dec();
+            lev_source = (*sources).get_lev_source();
             sfc_source = (*sources).get_sfc_source();
 
-
-            //gpt_combine_kernel_launcher_cuda_rt::get_from_gpoint(
-            //        n_col, n_lay, igpt-1, tau.ptr(), lay_source.ptr(), lev_source_inc.ptr(), lev_source_dec.ptr(),
-            //        optical_props->get_tau().ptr(), (*sources).get_lay_source().ptr(),
-            //        (*sources).get_lev_source_inc().ptr(), (*sources).get_lev_source_dec().ptr());
-
-            //gpt_combine_kernel_launcher_cuda_rt::get_from_gpoint(
-            //        n_col, igpt-1, sfc_source.ptr(), (*sources).get_sfc_source().ptr());
+            tot_tau_out = optical_props->get_tau();
+            cld_tau_out = cloud_optical_props->get_tau();
+            //tot_ssa_out = optical_props->get_ssa();
+            //cld_ssa_out = cloud_optical_props->get_ssa();
+            //cld_asy_out = cloud_optical_props->get_g();
+            //aer_tau_out = aerosol_optical_props->get_tau();
+            //aer_ssa_out = aerosol_optical_props->get_ssa();
+            //aer_asy_out = aerosol_optical_props->get_g();
         }
 
 
@@ -522,7 +603,7 @@ void Radiation_solver_longwave::solve_gpu(
             constexpr int n_ang = 1;
 
             std::unique_ptr<Fluxes_broadband_rt> fluxes =
-                    std::make_unique<Fluxes_broadband_rt>(n_col, 1, n_lev);
+                    std::make_unique<Fluxes_broadband_rt>(grid_cells.x, grid_cells.y, grid_cells.z, n_lev);
 
             rte_lw.rte_lw(
                     optical_props,
@@ -546,13 +627,9 @@ void Radiation_solver_longwave::solve_gpu(
                 lw_gpt_flux_up = (*fluxes).get_flux_up();
                 lw_gpt_flux_dn = (*fluxes).get_flux_dn();
                 lw_gpt_flux_net = (*fluxes).get_flux_net();
-                //gpt_combine_kernel_launcher_cuda_rt::get_from_gpoint(
-                //        n_col, n_lev, igpt-1, lw_gpt_flux_up.ptr(), lw_gpt_flux_dn.ptr(), lw_gpt_flux_net.ptr(),
-                //        (*fluxes).get_flux_up().ptr(), (*fluxes).get_flux_dn().ptr(), (*fluxes).get_flux_net().ptr());
             }
         }
     }
-    */
 }
 
 Radiation_solver_shortwave::Radiation_solver_shortwave(
@@ -615,7 +692,7 @@ void Radiation_solver_shortwave::solve_gpu(
         const Array_gpu<Float,2>& lwp, const Array_gpu<Float,2>& iwp,
         const Array_gpu<Float,2>& rel, const Array_gpu<Float,2>& dei,
         const Array_gpu<Float,2>& rh,
-        const Aerosol_concs_gpu& aerosol_concs,
+        Aerosol_concs_gpu& aerosol_concs,
         Array_gpu<Float,2>& tot_tau_out, Array_gpu<Float,2>& tot_ssa_out,
         Array_gpu<Float,2>& cld_tau_out, Array_gpu<Float,2>& cld_ssa_out, Array_gpu<Float,2>& cld_asy_out,
         Array_gpu<Float,2>& aer_tau_out, Array_gpu<Float,2>& aer_ssa_out, Array_gpu<Float,2>& aer_asy_out,
@@ -653,9 +730,6 @@ void Radiation_solver_shortwave::solve_gpu(
 
     Array_gpu<Float,1> toa_src({n_col});
 
-    Array<int,2> cld_mask_liq({n_col, n_lay});
-    Array<int,2> cld_mask_ice({n_col, n_lay});
-
     Array_gpu<Float,2> mie_cdfs_sub;
     Array_gpu<Float,3> mie_angs_sub;
 
@@ -695,6 +769,8 @@ void Radiation_solver_shortwave::solve_gpu(
             }
         }
 
+        Gas_optics_rrtmgp_kernels_cuda_rt::zero_array(n_col, n_lay, optical_props->get_tau().ptr());
+
         // We loop over the gas optics, due to memory constraints
         constexpr int n_col_block = 1<<14;
 
@@ -746,10 +822,10 @@ void Radiation_solver_shortwave::solve_gpu(
         {
             if (band > previous_band)
             {
-                Aerosol_concs_gpu aerosol_concs_subset(aerosol_concs, 1, n_col);
+                // Aerosol_concs_gpu aerosol_concs_subset(aerosol_concs, 1, n_col);
                 aerosol_optics_gpu->aerosol_optics(
                         band,
-                        aerosol_concs_subset,
+                        aerosol_concs,
                         rh, p_lev,
                         *aerosol_optical_props);
                 if (switch_delta_aerosol)
diff --git a/src_test/test_rt_lite.cu b/src_test/test_rt_lite.cu
index b0e45351..142c361e 100644
--- a/src_test/test_rt_lite.cu
+++ b/src_test/test_rt_lite.cu
@@ -366,11 +366,11 @@ void solve_radiation(int argc, char** argv)
         flux_dn_dir_2stream.set_dims({ncol, n_lay+1});
 
         Rte_sw_rt rte_sw;
-        Rte_solver_kernels_cuda_rt::apply_BC(ncol, n_lay, 1, 0, tod_dir * cos(zenith_angle), flux_dn_dir_2stream.ptr());
-        Rte_solver_kernels_cuda_rt::apply_BC(ncol, n_lay, 1, 0, flux_dn_2stream.ptr());
+        Rte_solver_kernels_cuda_rt::apply_BC(ncol, n_lay, 0, tod_dir * cos(zenith_angle), flux_dn_dir_2stream.ptr());
+        Rte_solver_kernels_cuda_rt::apply_BC(ncol, n_lay, 0, flux_dn_2stream.ptr());
 
         Rte_solver_kernels_cuda_rt::sw_solver_2stream(
-            ncol, n_lay, 1, 0,
+            ncol, n_lay, 0,
             tot_tau_g.ptr(),
             tot_ssa_g.ptr(),
             tot_asy_g.ptr(),
diff --git a/src_test/test_rte_rrtmgp_bw.cu b/src_test/test_rte_rrtmgp_bw.cu
index 3faaae62..87bcc929 100644
--- a/src_test/test_rte_rrtmgp_bw.cu
+++ b/src_test/test_rte_rrtmgp_bw.cu
@@ -401,7 +401,7 @@ void solve_radiation(int argc, char** argv)
             rel = std::move(input_nc.get_variable<Float>("rel", {n_lay, n_col_y, n_col_x}));
         }
 
-        if(switch_ice_cloud_optics)
+        if (switch_ice_cloud_optics)
         {
             iwp.set_dims({n_col, n_lay});
             iwp = std::move(input_nc.get_variable<Float>("iwp", {n_lay, n_col_y, n_col_x}));
diff --git a/src_test/test_rte_rrtmgp_rt.cu b/src_test/test_rte_rrtmgp_rt.cu
index c02841d1..ef81fb0e 100644
--- a/src_test/test_rte_rrtmgp_rt.cu
+++ b/src_test/test_rte_rrtmgp_rt.cu
@@ -276,8 +276,9 @@ void solve_radiation(int argc, char** argv)
 
     if (switch_longwave)
     {
-        std::string error = "No longwave radiation implemented in the ray tracer";
-        throw std::runtime_error(error);
+        Status::print_message("Note: no longwave radiation implemented in the ray tracer, yet");
+        // std::string error = "No longwave radiation implemented in the ray tracer";
+        // throw std::runtime_error(error);
     }
 
     if (!switch_twostream && !switch_raytracing) {
@@ -483,7 +484,6 @@ void solve_radiation(int argc, char** argv)
             const Array<Float,2>& gas = aerosol_concs.get_vmr(aerosol_name);
             if (gas.size() > 1) {
                 if (gas.get_dims()[0] == 1){
-                    printf("----");
                     aerosol_concs.set_vmr(aerosol_name, aerosol_concs.get_vmr(aerosol_name).subset({ {{1,n_col}, {1, n_lay}}} ));
                 }
             }
@@ -623,18 +623,18 @@ void solve_radiation(int argc, char** argv)
         Array<Float,1> t_sfc(input_nc.get_variable<Float>("t_sfc", {n_col_y, n_col_x}), {n_col});
 
         // Create output arrays.
-        Array_gpu<Float,2> lw_tau;
+        Array_gpu<Float,2> lw_tau_tot;
+        Array_gpu<Float,2> lw_tau_cld;
         Array_gpu<Float,2> lay_source;
-        Array_gpu<Float,2> lev_source_inc;
-        Array_gpu<Float,2> lev_source_dec;
+        Array_gpu<Float,2> lev_source;
         Array_gpu<Float,1> sfc_source;
 
         if (switch_single_gpt)
         {
-            lw_tau        .set_dims({n_col, n_lay});
+            lw_tau_tot    .set_dims({n_col, n_lay});
+            lw_tau_cld    .set_dims({n_col, n_lay});
             lay_source    .set_dims({n_col, n_lay});
-            lev_source_inc.set_dims({n_col, n_lay});
-            lev_source_dec.set_dims({n_col, n_lay});
+            lev_source    .set_dims({n_col, n_lev});
             sfc_source    .set_dims({n_col});
         }
 
@@ -678,6 +678,10 @@ void solve_radiation(int argc, char** argv)
             Array_gpu<Float,2> iwp_gpu(iwp);
             Array_gpu<Float,2> rel_gpu(rel);
             Array_gpu<Float,2> dei_gpu(dei);
+            Array_gpu<Float,2> rh_gpu(rh);
+
+            // note: aerosol optics not yet implemented in LW
+            Aerosol_concs_gpu aerosol_concs_gpu(aerosol_concs);
 
 
             cudaDeviceSynchronize();
@@ -690,17 +694,24 @@ void solve_radiation(int argc, char** argv)
 
             rad_lw.solve_gpu(
                     switch_fluxes,
+                    switch_raytracing,
                     switch_cloud_optics,
+                    switch_aerosol_optics,
                     switch_single_gpt,
                     single_gpt,
+                    photons_per_pixel,
+                    grid_cells,
+                    grid_d,
+                    kn_grid,
                     gas_concs_gpu,
+                    aerosol_concs_gpu,
                     p_lay_gpu, p_lev_gpu,
                     t_lay_gpu, t_lev_gpu,
                     col_dry_gpu,
                     t_sfc_gpu, emis_sfc_gpu,
                     lwp_gpu, iwp_gpu,
-                    rel_gpu, dei_gpu,
-                    lw_tau, lay_source, lev_source_inc, lev_source_dec, sfc_source,
+                    rel_gpu, dei_gpu, rh_gpu,
+                    lw_tau_tot, lw_tau_cld, lay_source, lev_source, sfc_source,
                     lw_flux_up, lw_flux_dn, lw_flux_net,
                     lw_gpt_flux_up, lw_gpt_flux_dn, lw_gpt_flux_net);
 
@@ -719,22 +730,20 @@ void solve_radiation(int argc, char** argv)
         run_solver();
 
         // Profiling step;
-        cudaProfilerStart();
-        run_solver();
-        cudaProfilerStop();
-
-        constexpr int n_measures=10;
-        for (int n=0; n<n_measures; ++n)
+        if (switch_profiling)
+        {
+            cudaProfilerStart();
             run_solver();
-
+            cudaProfilerStop();
+        }
 
         //// Store the output.
         Status::print_message("Storing the longwave output.");
-        Array<Float,2> lw_tau_cpu(lw_tau);
+        Array<Float,2> lw_tau_tot_cpu(lw_tau_tot);
+        Array<Float,2> lw_tau_cld_cpu(lw_tau_cld);
         Array<Float,2> lay_source_cpu(lay_source);
         Array<Float,1> sfc_source_cpu(sfc_source);
-        Array<Float,2> lev_source_inc_cpu(lev_source_inc);
-        Array<Float,2> lev_source_dec_cpu(lev_source_dec);
+        Array<Float,2> lev_source_cpu(lev_source);
         Array<Float,2> lw_flux_up_cpu(lw_flux_up);
         Array<Float,2> lw_flux_dn_cpu(lw_flux_dn);
         Array<Float,2> lw_flux_net_cpu(lw_flux_net);
@@ -753,18 +762,19 @@ void solve_radiation(int argc, char** argv)
             auto nc_lw_band_lims_gpt = output_nc.add_variable<int>("lw_band_lims_gpt", {"band_lw", "pair"});
             nc_lw_band_lims_gpt.insert(rad_lw.get_band_lims_gpoint_gpu().v(), {0, 0});
 
-            auto nc_lw_tau = output_nc.add_variable<Float>("lw_tau", {"lay", "y", "x"});
-            nc_lw_tau.insert(lw_tau_cpu.v(), {0, 0, 0});
+            auto nc_lw_tau_tot = output_nc.add_variable<Float>("lw_tau_tot", {"lay", "y", "x"});
+            nc_lw_tau_tot.insert(lw_tau_tot_cpu.v(), {0, 0, 0});
+
+            auto nc_lw_tau_cld = output_nc.add_variable<Float>("lw_tau_cld", {"lay", "y", "x"});
+            nc_lw_tau_cld.insert(lw_tau_cld_cpu.v(), {0, 0, 0});
 
             auto nc_lay_source     = output_nc.add_variable<Float>("lay_source"    , {"lay", "y", "x"});
-            auto nc_lev_source_inc = output_nc.add_variable<Float>("lev_source_inc", {"lay", "y", "x"});
-            auto nc_lev_source_dec = output_nc.add_variable<Float>("lev_source_dec", {"lay", "y", "x"});
+            auto nc_lev_source = output_nc.add_variable<Float>("lev_source", {"lev", "y", "x"});
 
             auto nc_sfc_source = output_nc.add_variable<Float>("sfc_source", {"y", "x"});
 
             nc_lay_source.insert    (lay_source_cpu.v()    , {0, 0, 0});
-            nc_lev_source_inc.insert(lev_source_inc_cpu.v(), {0, 0, 0});
-            nc_lev_source_dec.insert(lev_source_dec_cpu.v(), {0, 0, 0});
+            nc_lev_source.insert(lev_source_cpu.v(), {0, 0, 0});
 
             nc_sfc_source.insert(sfc_source_cpu.v(), {0, 0});
         }