import sys
import numpy as np
import ctypes as ct
# Stub code for OpenCL setup.

import pyopencl as cl
import numpy as np
import sys

# Refuse to continue on PyOpenCL releases older than 2015.2; the error
# reports the installed version text so the user knows what to upgrade.
if cl.version.VERSION < (2015, 2):
    raise Exception(
        'Futhark requires at least PyOpenCL version 2015.2.  Installed version is %s.'
        % cl.version.VERSION_TEXT)

def parse_preferred_device(s):
    """Split an optional ``#<number>`` prefix off a device preference string.

    A string longer than one character that starts with ``#`` is read as
    ``#<digits><whitespace><name>``: the digits give the index of the
    wanted device among the matching ones, and the rest (with the
    whitespace following the digits skipped) is the device name.  A
    ``#`` that is not followed by digits yields index 0.

    Any other string is returned unchanged with index 0.

    Returns a ``(name, index)`` tuple.
    """
    if len(s) <= 1 or s[0] != '#':
        return (s, 0)

    end = len(s)
    pos = 1
    number = 0

    # Accumulate the decimal digits that follow the '#'.
    while pos < end and s[pos].isdigit():
        number = number * 10 + int(s[pos])
        pos += 1

    # Drop the whitespace separating the number from the name.
    while pos < end and s[pos].isspace():
        pos += 1

    return (s[pos:], number)

def get_prefered_context(interactive=False, platform_pref=None, device_pref=None):
    """Create an OpenCL context for the preferred platform and device.

    ``device_pref`` may carry a ``#<number>`` prefix (see
    ``parse_preferred_device``) selecting the N-th matching device
    instead of the first one.  The prefix is parsed even when
    ``interactive`` is set, although the result is not used in that case.

    When ``interactive`` is true, device selection is delegated to
    ``cl.create_some_context(interactive=True)``.

    Otherwise platforms and devices are scanned in enumeration order.
    A platform or device matches when the corresponding preference is
    empty/None or is a substring of its name.  If neither preference
    was given, devices whose name contains "Intel(R) Core(TM)" on the
    "Apple" platform are skipped.

    Raises ``Exception`` when no matching device (or not enough of
    them to reach the requested index) is found.
    """
    device_num = 0
    if device_pref is not None:
        (device_pref, device_num) = parse_preferred_device(device_pref)

    if interactive:
        return cl.create_some_context(interactive=True)

    # The blacklist only applies when the user expressed no preference at all.
    use_blacklist = platform_pref is None and device_pref is None

    # Number of matching devices still to be skipped before we pick one.
    to_skip = device_num

    for platform in cl.get_platforms():
        if platform_pref and platform.name.find(platform_pref) < 0:
            continue
        for device in platform.get_devices():
            if use_blacklist and platform.name == "Apple" \
               and device.name.find("Intel(R) Core(TM)") >= 0:
                continue
            if device_pref and device.name.find(device_pref) < 0:
                continue
            if to_skip == 0:
                return cl.Context(devices=[device])
            to_skip -= 1

    raise Exception('No OpenCL platform and device matching constraints found.')

def size_assignment(s):
    """Parse a ``NAME=VALUE`` string into a ``(name, int(value))`` tuple.

    Raises ``ValueError`` when ``s`` does not contain exactly one ``=``
    or when the value part is not a valid integer.
    """
    # Unpacking into exactly two names rejects zero or several '=' signs.
    (name, value) = s.split('=')
    number = int(value)
    return (name, number)

def check_types(self, required_types):
    """Verify that the chosen device supports the types the program needs.

    Only ``'f64'`` is checked: if it is listed in ``required_types`` and
    ``self.device`` reports a preferred double vector width of 0, an
    ``Exception`` naming the device is raised.  Otherwise nothing happens.
    """
    if 'f64' not in required_types:
        return

    double_width = self.device.get_info(cl.device_info.PREFERRED_VECTOR_WIDTH_DOUBLE)
    if double_width == 0:
        raise Exception('Program uses double-precision floats, but this is not supported on chosen device: %s' % self.device.name)

def apply_size_heuristics(self, size_heuristics, sizes):
    """Fill in unset entries of ``sizes`` from platform/device heuristics.

    Each heuristic is a ``(platform_name, device_type, size, value)``
    tuple.  It applies when ``sizes[size]`` is still ``None``,
    ``platform_name`` occurs in ``self.platform.name`` and
    ``self.device.type`` equals ``device_type``.  Heuristics are tried in
    order, so the first applicable one for a given size wins.

    A string ``value`` names an attribute of ``cl.device_info`` whose
    value is queried from the device; any other ``value`` is used as is.

    ``sizes`` is updated in place and also returned.
    """
    for heuristic in size_heuristics:
        (platform_name, device_type, size, value) = heuristic

        # Skip sizes that already have a value and heuristics meant for
        # another platform or device type.
        if not (sizes[size] == None):
            continue
        if self.platform.name.find(platform_name) < 0:
            continue
        if not (self.device.type == device_type):
            continue

        if type(value) == str:
            value = self.device.get_info(getattr(cl.device_info, value))
        sizes[size] = value

    return sizes

def initialise_opencl_object(self,
                             program_src='',
                             command_queue=None,
                             interactive=False,
                             platform_pref=None,
                             device_pref=None,
                             default_group_size=None,
                             default_num_groups=None,
                             default_tile_size=None,
                             default_threshold=None,
                             size_heuristics=[],
                             required_types=[],
                             all_sizes={},
                             user_sizes={}):
    """Attach OpenCL state to ``self`` and build the OpenCL program.

    Parameters:
      program_src: OpenCL C source text to compile.
      command_queue: existing command queue to reuse; when ``None`` a
        context is obtained from ``get_prefered_context`` (using
        ``interactive``, ``platform_pref`` and ``device_pref``) and a
        new queue is created on it.
      default_group_size, default_num_groups, default_tile_size,
      default_threshold: fallback values for the size classes.  Entries
        with the same names in the module-level ``sizes`` dict take
        precedence and are removed from that dict.
      size_heuristics: heuristics passed to ``apply_size_heuristics``
        for defaults that are still ``None``.
      required_types: type names passed to ``check_types``.
      all_sizes: maps each size name to a dict with at least the keys
        ``'class'`` and ``'value'``.  The ``'value'`` entries are
        overwritten in place for names present in ``user_sizes``.
      user_sizes: maps size names to user-chosen values.

    Attributes set on ``self``: ``ctx``, ``queue``, ``device``,
    ``platform``, ``pool``, ``max_group_size``, ``max_tile_size``,
    ``max_threshold``, ``max_num_groups``, ``max_local_memory``,
    ``free_list`` and ``sizes``.

    Returns the built ``cl.Program``.

    Raises ``Exception`` for a name in ``user_sizes`` that is not in
    ``all_sizes`` and for an unknown size class, in addition to whatever
    ``get_prefered_context`` and ``check_types`` raise.
    """
    if command_queue is None:
        self.ctx = get_prefered_context(interactive, platform_pref, device_pref)
        self.queue = cl.CommandQueue(self.ctx)
    else:
        self.ctx = command_queue.context
        self.queue = command_queue
    self.device = self.queue.device
    self.platform = self.device.platform
    self.pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(self.queue))

    check_types(self, required_types)

    max_group_size = int(self.device.max_work_group_size)
    # Tile size limit is the integer square root of the work-group limit.
    max_tile_size = int(np.sqrt(self.device.max_work_group_size))

    self.max_group_size = max_group_size
    self.max_tile_size = max_tile_size
    self.max_threshold = 0
    self.max_num_groups = 0
    self.max_local_memory = int(self.device.local_mem_size)
    self.free_list = {}

    # Defaults stored in the module-level `sizes` dict override the
    # keyword arguments; each one is consumed (removed) once read.
    default_group_size = sizes.pop('default_group_size', default_group_size)
    default_num_groups = sizes.pop('default_num_groups', default_num_groups)
    default_tile_size = sizes.pop('default_tile_size', default_tile_size)
    default_threshold = sizes.pop('default_threshold', default_threshold)

    # Remember whether the values were chosen explicitly, so that the
    # clamping notes below are only printed for explicit choices and
    # not for values produced by the heuristics.
    default_group_size_set = default_group_size is not None
    default_tile_size_set = default_tile_size is not None
    default_sizes = apply_size_heuristics(self, size_heuristics,
                                          {'group_size': default_group_size,
                                           'tile_size': default_tile_size,
                                           'num_groups': default_num_groups,
                                           'lockstep_width': None,
                                           'threshold': default_threshold})
    default_group_size = default_sizes['group_size']
    default_num_groups = default_sizes['num_groups']
    default_threshold = default_sizes['threshold']
    default_tile_size = default_sizes['tile_size']
    lockstep_width = default_sizes['lockstep_width']

    if default_group_size > max_group_size:
        if default_group_size_set:
            # Report the group size limit (not the tile size limit).
            sys.stderr.write('Note: Device limits group size to {} (down from {})\n'.
                             format(max_group_size, default_group_size))
        default_group_size = max_group_size

    if default_tile_size > max_tile_size:
        if default_tile_size_set:
            sys.stderr.write('Note: Device limits tile size to {} (down from {})\n'.
                             format(max_tile_size, default_tile_size))
        default_tile_size = max_tile_size

    for (k, v) in user_sizes.items():
        if k in all_sizes:
            all_sizes[k]['value'] = v
        else:
            raise Exception('Unknown size: {}\nKnown sizes: {}'.format(k, ' '.join(all_sizes.keys())))

    # Resolve every declared size to its final value: explicit value if
    # present (clamped to the class limit), otherwise the class default.
    self.sizes = {}
    for (k, v) in all_sizes.items():
        if v['class'] == 'group_size':
            max_value = max_group_size
            default_value = default_group_size
        elif v['class'] == 'num_groups':
            max_value = max_group_size # Intentional!
            default_value = default_num_groups
        elif v['class'] == 'tile_size':
            max_value = max_tile_size
            default_value = default_tile_size
        elif v['class'].startswith('threshold'):
            max_value = None
            default_value = default_threshold
        else:
            raise Exception('Unknown size class for size \'{}\': {}'.format(k, v['class']))
        if v['value'] is None:
            self.sizes[k] = default_value
        elif max_value is not None and v['value'] > max_value:
            sys.stderr.write('Note: Device limits {} to {} (down from {})\n'.
                             format(k, max_value, v['value']))
            self.sizes[k] = max_value
        else:
            self.sizes[k] = v['value']

    # XXX: we perform only a subset of z-encoding here.  Really, the
    # compiler should provide us with the variables to which
    # parameters are mapped.
    #
    # NOTE(review): this condition is always true (a length is never
    # negative), so the program is built even for empty source.  Left
    # as is because callers may rely on always getting a program back.
    if (len(program_src) >= 0):
        return cl.Program(self.ctx, program_src).build(
            ["-DLOCKSTEP_WIDTH={}".format(lockstep_width)]
            + ["-D{}={}".format(s.replace('z', 'zz').replace('.', 'zi'),v) for (s,v) in self.sizes.items()])

def opencl_alloc(self, min_size, tag):
    """Allocate at least ``min_size`` bytes from ``self.pool``.

    A request for 0 bytes is rounded up to 1 byte.  ``tag`` is accepted
    but not used.  Returns whatever ``self.pool.allocate`` returns.
    """
    if min_size == 0:
        min_size = 1
    assert min_size > 0
    return self.pool.allocate(min_size)

def opencl_free_all(self):
    """Ask ``self.pool`` to release the memory it is holding on to."""
    pool = self.pool
    pool.free_held()
import pyopencl.array
import time
import argparse
# Module-level runtime options, all starting out unset.
# `sizes` holds default_* overrides that initialise_opencl_object
# reads and removes.
sizes = dict()
synchronous = False
preferred_platform, preferred_device = None, None
default_threshold = None
default_group_size, default_num_groups, default_tile_size = None, None, None
fut_opencl_src = """#ifdef cl_clang_storage_class_specifiers
#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
#endif
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
__kernel void dummy_kernel(__global unsigned char *dummy, int n)
{
    const int thread_gid = get_global_id(0);
    
    if (thread_gid >= n)
        return;
}
typedef char int8_t;
typedef short int16_t;
typedef int int32_t;
typedef long int64_t;
typedef uchar uint8_t;
typedef ushort uint16_t;
typedef uint uint32_t;
typedef ulong uint64_t;
#define ALIGNED_LOCAL_MEMORY(m,size) __local unsigned char m[size] __attribute__ ((align))
#ifdef cl_nv_pragma_unroll
static inline void mem_fence_global()
{
    asm("membar.gl;");
}
#else
static inline void mem_fence_global()
{
    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
}
#endif
static inline void mem_fence_local()
{
    mem_fence(CLK_LOCAL_MEM_FENCE);
}
static inline int8_t add8(int8_t x, int8_t y)
{
    return x + y;
}
static inline int16_t add16(int16_t x, int16_t y)
{
    return x + y;
}
static inline int32_t add32(int32_t x, int32_t y)
{
    return x + y;
}
static inline int64_t add64(int64_t x, int64_t y)
{
    return x + y;
}
static inline int8_t sub8(int8_t x, int8_t y)
{
    return x - y;
}
static inline int16_t sub16(int16_t x, int16_t y)
{
    return x - y;
}
static inline int32_t sub32(int32_t x, int32_t y)
{
    return x - y;
}
static inline int64_t sub64(int64_t x, int64_t y)
{
    return x - y;
}
static inline int8_t mul8(int8_t x, int8_t y)
{
    return x * y;
}
static inline int16_t mul16(int16_t x, int16_t y)
{
    return x * y;
}
static inline int32_t mul32(int32_t x, int32_t y)
{
    return x * y;
}
static inline int64_t mul64(int64_t x, int64_t y)
{
    return x * y;
}
static inline uint8_t udiv8(uint8_t x, uint8_t y)
{
    return x / y;
}
static inline uint16_t udiv16(uint16_t x, uint16_t y)
{
    return x / y;
}
static inline uint32_t udiv32(uint32_t x, uint32_t y)
{
    return x / y;
}
static inline uint64_t udiv64(uint64_t x, uint64_t y)
{
    return x / y;
}
static inline uint8_t umod8(uint8_t x, uint8_t y)
{
    return x % y;
}
static inline uint16_t umod16(uint16_t x, uint16_t y)
{
    return x % y;
}
static inline uint32_t umod32(uint32_t x, uint32_t y)
{
    return x % y;
}
static inline uint64_t umod64(uint64_t x, uint64_t y)
{
    return x % y;
}
static inline int8_t sdiv8(int8_t x, int8_t y)
{
    int8_t q = x / y;
    int8_t r = x % y;
    
    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}
static inline int16_t sdiv16(int16_t x, int16_t y)
{
    int16_t q = x / y;
    int16_t r = x % y;
    
    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}
static inline int32_t sdiv32(int32_t x, int32_t y)
{
    int32_t q = x / y;
    int32_t r = x % y;
    
    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}
static inline int64_t sdiv64(int64_t x, int64_t y)
{
    int64_t q = x / y;
    int64_t r = x % y;
    
    return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0);
}
static inline int8_t smod8(int8_t x, int8_t y)
{
    int8_t r = x % y;
    
    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}
static inline int16_t smod16(int16_t x, int16_t y)
{
    int16_t r = x % y;
    
    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}
static inline int32_t smod32(int32_t x, int32_t y)
{
    int32_t r = x % y;
    
    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}
static inline int64_t smod64(int64_t x, int64_t y)
{
    int64_t r = x % y;
    
    return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y);
}
static inline int8_t squot8(int8_t x, int8_t y)
{
    return x / y;
}
static inline int16_t squot16(int16_t x, int16_t y)
{
    return x / y;
}
static inline int32_t squot32(int32_t x, int32_t y)
{
    return x / y;
}
static inline int64_t squot64(int64_t x, int64_t y)
{
    return x / y;
}
static inline int8_t srem8(int8_t x, int8_t y)
{
    return x % y;
}
static inline int16_t srem16(int16_t x, int16_t y)
{
    return x % y;
}
static inline int32_t srem32(int32_t x, int32_t y)
{
    return x % y;
}
static inline int64_t srem64(int64_t x, int64_t y)
{
    return x % y;
}
static inline int8_t smin8(int8_t x, int8_t y)
{
    return x < y ? x : y;
}
static inline int16_t smin16(int16_t x, int16_t y)
{
    return x < y ? x : y;
}
static inline int32_t smin32(int32_t x, int32_t y)
{
    return x < y ? x : y;
}
static inline int64_t smin64(int64_t x, int64_t y)
{
    return x < y ? x : y;
}
static inline uint8_t umin8(uint8_t x, uint8_t y)
{
    return x < y ? x : y;
}
static inline uint16_t umin16(uint16_t x, uint16_t y)
{
    return x < y ? x : y;
}
static inline uint32_t umin32(uint32_t x, uint32_t y)
{
    return x < y ? x : y;
}
static inline uint64_t umin64(uint64_t x, uint64_t y)
{
    return x < y ? x : y;
}
static inline int8_t smax8(int8_t x, int8_t y)
{
    return x < y ? y : x;
}
static inline int16_t smax16(int16_t x, int16_t y)
{
    return x < y ? y : x;
}
static inline int32_t smax32(int32_t x, int32_t y)
{
    return x < y ? y : x;
}
static inline int64_t smax64(int64_t x, int64_t y)
{
    return x < y ? y : x;
}
static inline uint8_t umax8(uint8_t x, uint8_t y)
{
    return x < y ? y : x;
}
static inline uint16_t umax16(uint16_t x, uint16_t y)
{
    return x < y ? y : x;
}
static inline uint32_t umax32(uint32_t x, uint32_t y)
{
    return x < y ? y : x;
}
static inline uint64_t umax64(uint64_t x, uint64_t y)
{
    return x < y ? y : x;
}
static inline uint8_t shl8(uint8_t x, uint8_t y)
{
    return x << y;
}
static inline uint16_t shl16(uint16_t x, uint16_t y)
{
    return x << y;
}
static inline uint32_t shl32(uint32_t x, uint32_t y)
{
    return x << y;
}
static inline uint64_t shl64(uint64_t x, uint64_t y)
{
    return x << y;
}
static inline uint8_t lshr8(uint8_t x, uint8_t y)
{
    return x >> y;
}
static inline uint16_t lshr16(uint16_t x, uint16_t y)
{
    return x >> y;
}
static inline uint32_t lshr32(uint32_t x, uint32_t y)
{
    return x >> y;
}
static inline uint64_t lshr64(uint64_t x, uint64_t y)
{
    return x >> y;
}
static inline int8_t ashr8(int8_t x, int8_t y)
{
    return x >> y;
}
static inline int16_t ashr16(int16_t x, int16_t y)
{
    return x >> y;
}
static inline int32_t ashr32(int32_t x, int32_t y)
{
    return x >> y;
}
static inline int64_t ashr64(int64_t x, int64_t y)
{
    return x >> y;
}
static inline uint8_t and8(uint8_t x, uint8_t y)
{
    return x & y;
}
static inline uint16_t and16(uint16_t x, uint16_t y)
{
    return x & y;
}
static inline uint32_t and32(uint32_t x, uint32_t y)
{
    return x & y;
}
static inline uint64_t and64(uint64_t x, uint64_t y)
{
    return x & y;
}
static inline uint8_t or8(uint8_t x, uint8_t y)
{
    return x | y;
}
static inline uint16_t or16(uint16_t x, uint16_t y)
{
    return x | y;
}
static inline uint32_t or32(uint32_t x, uint32_t y)
{
    return x | y;
}
static inline uint64_t or64(uint64_t x, uint64_t y)
{
    return x | y;
}
static inline uint8_t xor8(uint8_t x, uint8_t y)
{
    return x ^ y;
}
static inline uint16_t xor16(uint16_t x, uint16_t y)
{
    return x ^ y;
}
static inline uint32_t xor32(uint32_t x, uint32_t y)
{
    return x ^ y;
}
static inline uint64_t xor64(uint64_t x, uint64_t y)
{
    return x ^ y;
}
static inline char ult8(uint8_t x, uint8_t y)
{
    return x < y;
}
static inline char ult16(uint16_t x, uint16_t y)
{
    return x < y;
}
static inline char ult32(uint32_t x, uint32_t y)
{
    return x < y;
}
static inline char ult64(uint64_t x, uint64_t y)
{
    return x < y;
}
static inline char ule8(uint8_t x, uint8_t y)
{
    return x <= y;
}
static inline char ule16(uint16_t x, uint16_t y)
{
    return x <= y;
}
static inline char ule32(uint32_t x, uint32_t y)
{
    return x <= y;
}
static inline char ule64(uint64_t x, uint64_t y)
{
    return x <= y;
}
static inline char slt8(int8_t x, int8_t y)
{
    return x < y;
}
static inline char slt16(int16_t x, int16_t y)
{
    return x < y;
}
static inline char slt32(int32_t x, int32_t y)
{
    return x < y;
}
static inline char slt64(int64_t x, int64_t y)
{
    return x < y;
}
static inline char sle8(int8_t x, int8_t y)
{
    return x <= y;
}
static inline char sle16(int16_t x, int16_t y)
{
    return x <= y;
}
static inline char sle32(int32_t x, int32_t y)
{
    return x <= y;
}
static inline char sle64(int64_t x, int64_t y)
{
    return x <= y;
}
static inline int8_t pow8(int8_t x, int8_t y)
{
    int8_t res = 1, rem = y;
    
    while (rem != 0) {
        if (rem & 1)
            res *= x;
        rem >>= 1;
        x *= x;
    }
    return res;
}
static inline int16_t pow16(int16_t x, int16_t y)
{
    int16_t res = 1, rem = y;
    
    while (rem != 0) {
        if (rem & 1)
            res *= x;
        rem >>= 1;
        x *= x;
    }
    return res;
}
static inline int32_t pow32(int32_t x, int32_t y)
{
    int32_t res = 1, rem = y;
    
    while (rem != 0) {
        if (rem & 1)
            res *= x;
        rem >>= 1;
        x *= x;
    }
    return res;
}
static inline int64_t pow64(int64_t x, int64_t y)
{
    int64_t res = 1, rem = y;
    
    while (rem != 0) {
        if (rem & 1)
            res *= x;
        rem >>= 1;
        x *= x;
    }
    return res;
}
static inline bool itob_i8_bool(int8_t x)
{
    return x;
}
static inline bool itob_i16_bool(int16_t x)
{
    return x;
}
static inline bool itob_i32_bool(int32_t x)
{
    return x;
}
static inline bool itob_i64_bool(int64_t x)
{
    return x;
}
static inline int8_t btoi_bool_i8(bool x)
{
    return x;
}
static inline int16_t btoi_bool_i16(bool x)
{
    return x;
}
static inline int32_t btoi_bool_i32(bool x)
{
    return x;
}
static inline int64_t btoi_bool_i64(bool x)
{
    return x;
}
#define sext_i8_i8(x) ((int8_t) (int8_t) x)
#define sext_i8_i16(x) ((int16_t) (int8_t) x)
#define sext_i8_i32(x) ((int32_t) (int8_t) x)
#define sext_i8_i64(x) ((int64_t) (int8_t) x)
#define sext_i16_i8(x) ((int8_t) (int16_t) x)
#define sext_i16_i16(x) ((int16_t) (int16_t) x)
#define sext_i16_i32(x) ((int32_t) (int16_t) x)
#define sext_i16_i64(x) ((int64_t) (int16_t) x)
#define sext_i32_i8(x) ((int8_t) (int32_t) x)
#define sext_i32_i16(x) ((int16_t) (int32_t) x)
#define sext_i32_i32(x) ((int32_t) (int32_t) x)
#define sext_i32_i64(x) ((int64_t) (int32_t) x)
#define sext_i64_i8(x) ((int8_t) (int64_t) x)
#define sext_i64_i16(x) ((int16_t) (int64_t) x)
#define sext_i64_i32(x) ((int32_t) (int64_t) x)
#define sext_i64_i64(x) ((int64_t) (int64_t) x)
#define zext_i8_i8(x) ((uint8_t) (uint8_t) x)
#define zext_i8_i16(x) ((uint16_t) (uint8_t) x)
#define zext_i8_i32(x) ((uint32_t) (uint8_t) x)
#define zext_i8_i64(x) ((uint64_t) (uint8_t) x)
#define zext_i16_i8(x) ((uint8_t) (uint16_t) x)
#define zext_i16_i16(x) ((uint16_t) (uint16_t) x)
#define zext_i16_i32(x) ((uint32_t) (uint16_t) x)
#define zext_i16_i64(x) ((uint64_t) (uint16_t) x)
#define zext_i32_i8(x) ((uint8_t) (uint32_t) x)
#define zext_i32_i16(x) ((uint16_t) (uint32_t) x)
#define zext_i32_i32(x) ((uint32_t) (uint32_t) x)
#define zext_i32_i64(x) ((uint64_t) (uint32_t) x)
#define zext_i64_i8(x) ((uint8_t) (uint64_t) x)
#define zext_i64_i16(x) ((uint16_t) (uint64_t) x)
#define zext_i64_i32(x) ((uint32_t) (uint64_t) x)
#define zext_i64_i64(x) ((uint64_t) (uint64_t) x)
static inline float fdiv32(float x, float y)
{
    return x / y;
}
static inline float fadd32(float x, float y)
{
    return x + y;
}
static inline float fsub32(float x, float y)
{
    return x - y;
}
static inline float fmul32(float x, float y)
{
    return x * y;
}
static inline float fmin32(float x, float y)
{
    return x < y ? x : y;
}
static inline float fmax32(float x, float y)
{
    return x < y ? y : x;
}
static inline float fpow32(float x, float y)
{
    return pow(x, y);
}
static inline char cmplt32(float x, float y)
{
    return x < y;
}
static inline char cmple32(float x, float y)
{
    return x <= y;
}
static inline float sitofp_i8_f32(int8_t x)
{
    return x;
}
static inline float sitofp_i16_f32(int16_t x)
{
    return x;
}
static inline float sitofp_i32_f32(int32_t x)
{
    return x;
}
static inline float sitofp_i64_f32(int64_t x)
{
    return x;
}
static inline float uitofp_i8_f32(uint8_t x)
{
    return x;
}
static inline float uitofp_i16_f32(uint16_t x)
{
    return x;
}
static inline float uitofp_i32_f32(uint32_t x)
{
    return x;
}
static inline float uitofp_i64_f32(uint64_t x)
{
    return x;
}
static inline int8_t fptosi_f32_i8(float x)
{
    return x;
}
static inline int16_t fptosi_f32_i16(float x)
{
    return x;
}
static inline int32_t fptosi_f32_i32(float x)
{
    return x;
}
static inline int64_t fptosi_f32_i64(float x)
{
    return x;
}
static inline uint8_t fptoui_f32_i8(float x)
{
    return x;
}
static inline uint16_t fptoui_f32_i16(float x)
{
    return x;
}
static inline uint32_t fptoui_f32_i32(float x)
{
    return x;
}
static inline uint64_t fptoui_f32_i64(float x)
{
    return x;
}
static inline float futrts_log32(float x)
{
    return log(x);
}
static inline float futrts_log2_32(float x)
{
    return log2(x);
}
static inline float futrts_log10_32(float x)
{
    return log10(x);
}
static inline float futrts_sqrt32(float x)
{
    return sqrt(x);
}
static inline float futrts_exp32(float x)
{
    return exp(x);
}
static inline float futrts_cos32(float x)
{
    return cos(x);
}
static inline float futrts_sin32(float x)
{
    return sin(x);
}
static inline float futrts_tan32(float x)
{
    return tan(x);
}
static inline float futrts_acos32(float x)
{
    return acos(x);
}
static inline float futrts_asin32(float x)
{
    return asin(x);
}
static inline float futrts_atan32(float x)
{
    return atan(x);
}
static inline float futrts_atan2_32(float x, float y)
{
    return atan2(x, y);
}
static inline float futrts_round32(float x)
{
    return rint(x);
}
static inline char futrts_isnan32(float x)
{
    return isnan(x);
}
static inline char futrts_isinf32(float x)
{
    return isinf(x);
}
static inline int32_t futrts_to_bits32(float x)
{
    union {
        float f;
        int32_t t;
    } p;
    
    p.f = x;
    return p.t;
}
static inline float futrts_from_bits32(int32_t x)
{
    union {
        int32_t f;
        float t;
    } p;
    
    p.f = x;
    return p.t;
}
__kernel void copy_38706(int32_t sizze_31215, int32_t res_31237,
                         int32_t j_31369, int32_t j_m_i_31370, __global
                         unsigned char *mem_38023, __global
                         unsigned char *mem_38036)
{
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    int32_t copy_gtid_38706;
    int32_t copy_ltid_38707;
    int32_t copy_gid_38708;
    
    copy_gtid_38706 = get_global_id(0);
    copy_ltid_38707 = get_local_id(0);
    copy_gid_38708 = get_group_id(0);
    if (slt32(copy_gtid_38706, sizze_31215 * (res_31237 * j_m_i_31370))) {
        *(__global float *) &mem_38036[(squot32(copy_gtid_38706, res_31237 *
                                                j_m_i_31370) * (j_m_i_31370 *
                                                                res_31237) +
                                        squot32(copy_gtid_38706 -
                                                squot32(copy_gtid_38706,
                                                        res_31237 *
                                                        j_m_i_31370) *
                                                (res_31237 * j_m_i_31370),
                                                j_m_i_31370) * j_m_i_31370 +
                                        (copy_gtid_38706 -
                                         squot32(copy_gtid_38706, res_31237 *
                                                 j_m_i_31370) * (res_31237 *
                                                                 j_m_i_31370) -
                                         squot32(copy_gtid_38706 -
                                                 squot32(copy_gtid_38706,
                                                         res_31237 *
                                                         j_m_i_31370) *
                                                 (res_31237 * j_m_i_31370),
                                                 j_m_i_31370) * j_m_i_31370)) *
                                       4] = *(__global
                                              float *) &mem_38023[(res_31237 +
                                                                   (squot32(copy_gtid_38706,
                                                                            res_31237 *
                                                                            j_m_i_31370) *
                                                                    (j_31369 *
                                                                     res_31237) +
                                                                    squot32(copy_gtid_38706 -
                                                                            squot32(copy_gtid_38706,
                                                                                    res_31237 *
                                                                                    j_m_i_31370) *
                                                                            (res_31237 *
                                                                             j_m_i_31370),
                                                                            j_m_i_31370) *
                                                                    j_31369 +
                                                                    (copy_gtid_38706 -
                                                                     squot32(copy_gtid_38706,
                                                                             res_31237 *
                                                                             j_m_i_31370) *
                                                                     (res_31237 *
                                                                      j_m_i_31370) -
                                                                     squot32(copy_gtid_38706 -
                                                                             squot32(copy_gtid_38706,
                                                                                     res_31237 *
                                                                                     j_m_i_31370) *
                                                                             (res_31237 *
                                                                              j_m_i_31370),
                                                                             j_m_i_31370) *
                                                                     j_m_i_31370))) *
                                                                  4];
    }
}
__kernel void copy_38772(int32_t sizze_31215, int32_t res_31237,
                         int32_t j_m_i_31370, __global
                         unsigned char *res_mem_38037, __global
                         unsigned char *mem_38095)
{
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    int32_t copy_gtid_38772;
    int32_t copy_ltid_38773;
    int32_t copy_gid_38774;
    
    copy_gtid_38772 = get_global_id(0);
    copy_ltid_38773 = get_local_id(0);
    copy_gid_38774 = get_group_id(0);
    if (slt32(copy_gtid_38772, sizze_31215 * (res_31237 * j_m_i_31370))) {
        *(__global float *) &mem_38095[((copy_gtid_38772 -
                                         squot32(copy_gtid_38772, res_31237 *
                                                 j_m_i_31370) * (res_31237 *
                                                                 j_m_i_31370) -
                                         squot32(copy_gtid_38772 -
                                                 squot32(copy_gtid_38772,
                                                         res_31237 *
                                                         j_m_i_31370) *
                                                 (res_31237 * j_m_i_31370),
                                                 j_m_i_31370) * j_m_i_31370) *
                                        (sizze_31215 * res_31237) +
                                        squot32(copy_gtid_38772 -
                                                squot32(copy_gtid_38772,
                                                        res_31237 *
                                                        j_m_i_31370) *
                                                (res_31237 * j_m_i_31370),
                                                j_m_i_31370) * sizze_31215 +
                                        squot32(copy_gtid_38772, res_31237 *
                                                j_m_i_31370)) * 4] = *(__global
                                                                       float *) &res_mem_38037[(squot32(copy_gtid_38772,
                                                                                                        res_31237 *
                                                                                                        j_m_i_31370) *
                                                                                                (j_m_i_31370 *
                                                                                                 res_31237) +
                                                                                                squot32(copy_gtid_38772 -
                                                                                                        squot32(copy_gtid_38772,
                                                                                                                res_31237 *
                                                                                                                j_m_i_31370) *
                                                                                                        (res_31237 *
                                                                                                         j_m_i_31370),
                                                                                                        j_m_i_31370) *
                                                                                                j_m_i_31370 +
                                                                                                (copy_gtid_38772 -
                                                                                                 squot32(copy_gtid_38772,
                                                                                                         res_31237 *
                                                                                                         j_m_i_31370) *
                                                                                                 (res_31237 *
                                                                                                  j_m_i_31370) -
                                                                                                 squot32(copy_gtid_38772 -
                                                                                                         squot32(copy_gtid_38772,
                                                                                                                 res_31237 *
                                                                                                                 j_m_i_31370) *
                                                                                                         (res_31237 *
                                                                                                          j_m_i_31370),
                                                                                                         j_m_i_31370) *
                                                                                                 j_m_i_31370)) *
                                                                                               4];
    }
}
// Strided gather of 32-bit words, one word per work-item.
//
// For every global id g below sizze_31215, the word at word index
// (i_31490 + g * sizze_31214) of mem_38270 is copied to word index g of
// mem_38277. NOTE(review): this reads like extracting column i_31490 of a
// row-major matrix whose rows hold sizze_31214 words -- confirm against the
// host code that launches it.
__kernel void copy_38968(int32_t sizze_31214, int32_t sizze_31215,
                         int32_t i_31490, __global unsigned char *mem_38270,
                         __global unsigned char *mem_38277)
{
    const int32_t gid = get_global_id(0);

    // Work-items beyond the destination length have nothing to do.
    if (!slt32(gid, sizze_31215))
        return;

    // Both buffers are byte-addressed; every element is 4 bytes wide.
    const int32_t src_byte = (i_31490 + gid * sizze_31214) * 4;
    const int32_t dst_byte = gid * 4;
    __global int32_t *src = (__global int32_t *) &mem_38270[src_byte];
    __global int32_t *dst = (__global int32_t *) &mem_38277[dst_byte];

    *dst = *src;
}
// Converts a 3-D array of int16 samples to float, one element per work-item.
//
// The flat global id is split into (i, j, k) over the extents
// sizze_31200 x sizze_31201 x sizze_31202 (k varies fastest). A sample equal
// to nan_value_31203 is written as NAN; any other sample is converted with
// sitofp_i16_f32. Input elements are 2 bytes wide, output elements 4 bytes.
__kernel void map_31914(int32_t sizze_31200, int32_t sizze_31201,
                        int32_t sizze_31202, int16_t nan_value_31203, __global
                        unsigned char *images_mem_37893, __global
                        unsigned char *mem_37898)
{
    const int32_t gid = get_global_id(0);

    // Decompose the flat id: plane index, then row and column inside it.
    const int32_t plane_elems = sizze_31201 * sizze_31202;
    const int32_t i = squot32(gid, plane_elems);
    const int32_t in_plane = gid - i * plane_elems;
    const int32_t j = squot32(in_plane, sizze_31202);
    const int32_t k = in_plane - j * sizze_31202;

    // Surplus work-items in the launch grid fall outside the array.
    if (!((slt32(i, sizze_31200) && slt32(j, sizze_31201)) &&
          slt32(k, sizze_31202)))
        return;

    // Input and output share the same element index.
    const int32_t elem = i * plane_elems + j * sizze_31202 + k;
    const int16_t sample =
        *(__global int16_t *) &images_mem_37893[elem * 2];
    const float converted =
        (sample == nan_value_31203) ? NAN : sitofp_i16_f32(sample);

    *(__global float *) &mem_37898[elem * 4] = converted;
}
// Fills a res_31237 x sizze_31214 row-major float matrix in mem_37898, one
// element per work-item. With t = mappingindices_mem_37893[col] as a float:
//   row 0           -> 1.0
//   row 1           -> t
//   other even rows -> sin(t * (6.2831855 * sdiv32(row, 2)) / freq_31220)
//   other odd rows  -> cos(t * (6.2831855 * sdiv32(row, 2)) / freq_31220)
__kernel void map_32091(int32_t sizze_31214, float freq_31220,
                        int32_t res_31237, __global
                        unsigned char *mappingindices_mem_37893, __global
                        unsigned char *mem_37898)
{
    const int32_t gid = get_global_id(0);
    const int32_t row = squot32(gid, sizze_31214);
    const int32_t col = gid - row * sizze_31214;

    // Surplus work-items in the launch grid fall outside the matrix.
    if (!(slt32(row, res_31237) && slt32(col, sizze_31214)))
        return;

    // The index is loaded up front for every row, including row 0.
    const int32_t t_raw =
        *(__global int32_t *) &mappingindices_mem_37893[col * 4];
    float value;

    if (row == 0) {
        value = 1.0F;
    } else if (row == 1) {
        value = sitofp_i32_f32(t_raw);
    } else {
        // Each rounding step stays in its own statement so the float
        // result matches a step-by-step evaluation.
        const float harmonic = sitofp_i32_f32(sdiv32(row, 2));
        const float two_pi_k = 6.2831855F * harmonic;
        const float t = sitofp_i32_f32(t_raw);
        const float scaled = t * two_pi_k;
        const float angle = scaled / freq_31220;

        if (smod32(row, 2) == 0)
            value = futrts_sin32(angle);
        else
            value = futrts_cos32(angle);
    }
    *(__global float *) &mem_37898[(row * sizze_31214 + col) * 4] = value;
}
// Fills a res_31237 x sizze_31214 row-major float matrix in mem_37902, one
// element per work-item. With t = mappingindices_mem_37893[col] as a float
// and k = row + 1:
//   row 0  -> 1.0
//   even k -> sin(t * (6.2831855 * sdiv32(k, 2)) / freq_31220)
//   odd k  -> cos(t * (6.2831855 * sdiv32(k, 2)) / freq_31220)
__kernel void map_32293(int32_t sizze_31214, float freq_31220,
                        int32_t res_31237, __global
                        unsigned char *mappingindices_mem_37893, __global
                        unsigned char *mem_37902)
{
    const int32_t gid = get_global_id(0);
    const int32_t row = squot32(gid, sizze_31214);
    const int32_t col = gid - row * sizze_31214;

    // Surplus work-items in the launch grid fall outside the matrix.
    if (!(slt32(row, res_31237) && slt32(col, sizze_31214)))
        return;

    // The index is loaded up front for every row, including row 0.
    const int32_t t_raw =
        *(__global int32_t *) &mappingindices_mem_37893[col * 4];
    float value;

    if (row == 0) {
        value = 1.0F;
    } else {
        const int32_t k = 1 + row;

        // Each rounding step stays in its own statement so the float
        // result matches a step-by-step evaluation.
        const float harmonic = sitofp_i32_f32(sdiv32(k, 2));
        const float two_pi_k = 6.2831855F * harmonic;
        const float t = sitofp_i32_f32(t_raw);
        const float scaled = t * two_pi_k;
        const float angle = scaled / freq_31220;

        if (smod32(k, 2) == 0)
            value = futrts_sin32(angle);
        else
            value = futrts_cos32(angle);
    }
    *(__global float *) &mem_37902[(row * sizze_31214 + col) * 4] = value;
}
// Adds the scalar res_31310 to every element of a sizze_31214 x res_31237
// row-major float matrix: mem_37911[r][c] = res_31310 + mem_37907[r][c].
// One element is handled per work-item.
__kernel void map_32447(int32_t sizze_31214, int32_t res_31237, float res_31310,
                        __global unsigned char *mem_37907, __global
                        unsigned char *mem_37911)
{
    const int32_t gid = get_global_id(0);
    const int32_t row = squot32(gid, res_31237);
    const int32_t col = gid - row * res_31237;

    // Surplus work-items in the launch grid fall outside the matrix.
    if (!(slt32(row, sizze_31214) && slt32(col, res_31237)))
        return;

    // Source and destination use the same byte offset (4-byte floats).
    const int32_t byte_off = (row * res_31237 + col) * 4;
    const float addend = *(__global float *) &mem_37907[byte_off];
    const float sum = res_31310 + addend;

    *(__global float *) &mem_37911[byte_off] = sum;
}
// For each work-item g below sizze_31215, builds a res_31237 x res_31237
// matrix of masked dot products over q in [0, n_31219):
//
//   out[i][j] = sum_q  arg_mem_37903[i][q] * mem_37911[q][j] * valid(q)
//
// where valid(q) is 0.0 when mem_37915[q][g] is NaN and 1.0 otherwise.
// Row strides are sizze_31214 for arg_mem_37903, res_31237 for mem_37911 and
// sizze_31215 for mem_37915.
//
// Results are first stored in the scratch buffer mem_37919 (indexed by
// group, then i, then j, with the local id varying fastest) and afterwards
// copied to mem_37930, where g varies fastest.
// NOTE(review): the scratch indexing uses mainzigroup_sizze_32508, which is
// defined outside this kernel; presumably it equals the launched work-group
// size -- confirm against the host code.
__kernel void map_32514(int32_t sizze_31214, int32_t sizze_31215,
                        int32_t n_31219, int32_t res_31237, __global
                        unsigned char *arg_mem_37903, __global
                        unsigned char *mem_37911, __global
                        unsigned char *mem_37915, __global
                        unsigned char *mem_37919, __global
                        unsigned char *mem_37930)
{
    const int32_t grp_size = mainzigroup_sizze_32508;
    const int32_t gid = get_global_id(0);
    const int32_t lid = get_local_id(0);
    const int32_t grp = get_group_id(0);

    // Surplus work-items in the launch grid have no column to process.
    if (!slt32(gid, sizze_31215))
        return;

    // Strides (in elements) of this work-item's slots in the scratch buffer.
    const int32_t scratch_base = grp * (grp_size * res_31237 * res_31237) + lid;
    const int32_t scratch_i = grp_size * res_31237;
    const int32_t scratch_j = grp_size;

    // Phase 1: compute every masked dot product into the scratch buffer.
    for (int32_t i = 0; i < res_31237; i++) {
        for (int32_t j = 0; j < res_31237; j++) {
            float acc = 0.0F;

            for (int32_t q = 0; q < n_31219; q++) {
                const float pixel =
                    *(__global float *) &mem_37915[(q * sizze_31215 + gid) * 4];
                const float lhs =
                    *(__global float *) &arg_mem_37903[(i * sizze_31214 + q) * 4];
                const float rhs =
                    *(__global float *) &mem_37911[(q * res_31237 + j) * 4];
                const float prod = lhs * rhs;

                // The mask is applied by multiplication, not by skipping,
                // so a non-finite product still propagates.
                const float mask = futrts_isnan32(pixel) ? 0.0F : 1.0F;
                const float term = prod * mask;

                acc = acc + term;
            }
            *(__global float *) &mem_37919[(scratch_base + i * scratch_i +
                                            j * scratch_j) * 4] = acc;
        }
    }

    // Phase 2: move this work-item's results from scratch to the output.
    for (int32_t i = 0; i < res_31237; i++) {
        for (int32_t j = 0; j < res_31237; j++) {
            const int32_t out_elem =
                gid + (i * (sizze_31215 * res_31237) + j * sizze_31215);
            const int32_t scratch_elem =
                scratch_base + (i * scratch_i + j * scratch_j);

            *(__global float *) &mem_37930[out_elem * 4] =
                *(__global float *) &mem_37919[scratch_elem * 4];
        }
    }
}
// For each (row, col) pair with row below sizze_31215 and col below
// res_31237, computes res_31237 masked dot products over q in [0, n_31219):
//
//   out[i] = sum_q  mem_37907[q][col] * mem_37911[q][i] * valid(q)
//
// where valid(q) is 0.0 when images_mem_37894[row][q] is NaN and 1.0
// otherwise. Row strides are res_31237 for mem_37907 and mem_37911, and
// sizze_31216 for images_mem_37894.
//
// Results are first stored in the scratch buffer mem_37951 (indexed by
// group, then i, with the local id varying fastest) and afterwards copied to
// mem_37956 at element i * (res_31237 * sizze_31215) + row * res_31237 + col.
// NOTE(review): the scratch indexing uses mainzigroup_sizze_32612, which is
// defined outside this kernel; presumably it equals the launched work-group
// size -- confirm against the host code.
__kernel void map_32618(int32_t sizze_31215, int32_t sizze_31216,
                        int32_t n_31219, int32_t res_31237, __global
                        unsigned char *images_mem_37894, __global
                        unsigned char *mem_37907, __global
                        unsigned char *mem_37911, __global
                        unsigned char *mem_37951, __global
                        unsigned char *mem_37956)
{
    const int32_t grp_size = mainzigroup_sizze_32612;
    const int32_t gid = get_global_id(0);
    const int32_t lid = get_local_id(0);
    const int32_t grp = get_group_id(0);

    const int32_t row = squot32(gid, res_31237);
    const int32_t col = gid - row * res_31237;

    // Surplus work-items in the launch grid fall outside the iteration space.
    if (!(slt32(row, sizze_31215) && slt32(col, res_31237)))
        return;

    // First scratch slot owned by this work-item; later slots are grp_size
    // elements apart.
    const int32_t scratch_base = grp * (grp_size * res_31237) + lid;

    // Phase 1: compute every masked dot product into the scratch buffer.
    for (int32_t i = 0; i < res_31237; i++) {
        float acc = 0.0F;

        for (int32_t q = 0; q < n_31219; q++) {
            const float pixel =
                *(__global float *) &images_mem_37894[(row * sizze_31216 + q) * 4];
            const float lhs =
                *(__global float *) &mem_37907[(q * res_31237 + col) * 4];
            const float rhs =
                *(__global float *) &mem_37911[(q * res_31237 + i) * 4];
            const float prod = lhs * rhs;

            // The mask is applied by multiplication, not by skipping, so a
            // non-finite product still propagates.
            const float mask = futrts_isnan32(pixel) ? 0.0F : 1.0F;
            const float term = prod * mask;

            acc = acc + term;
        }
        *(__global float *) &mem_37951[(scratch_base + i * grp_size) * 4] = acc;
    }

    // Phase 2: move this work-item's results from scratch to the output.
    for (int32_t i = 0; i < res_31237; i++) {
        const int32_t out_elem =
            row * res_31237 + col + i * (res_31237 * sizze_31215);

        *(__global float *) &mem_37956[out_elem * 4] =
            *(__global float *) &mem_37951[(scratch_base + i * grp_size) * 4];
    }
}
__kernel void map_32725(__local volatile int64_t *mem_37981_backing_aligned_0,
                        int32_t sizze_31215, int32_t n_31219, int32_t res_31237,
                        int32_t gidzz_range_37200, int32_t tile_sizze_x_37204,
                        int32_t tiled_group_sizze_37206, __global
                        unsigned char *mem_37907, __global
                        unsigned char *mem_37911, __global
                        unsigned char *mem_37974, __global
                        unsigned char *mem_37978)
{
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    __local volatile char *restrict mem_37981_backing_0 =
                          mem_37981_backing_aligned_0;
    int32_t global_tid_32725;
    int32_t local_tid_32726;
    int32_t group_sizze_38612;
    int32_t wave_sizze_38611;
    int32_t group_id_32727;
    
    global_tid_32725 = get_global_id(0);
    local_tid_32726 = get_local_id(0);
    group_sizze_38612 = get_local_size(0);
    wave_sizze_38611 = LOCKSTEP_WIDTH;
    group_id_32727 = get_group_id(0);
    
    int32_t gtid_32714;
    int32_t gtid_32715;
    int32_t gtid_32716;
    int32_t ltid_37207;
    int32_t ltid_37208;
    int32_t ltid_37209;
    
    gtid_32714 = squot32(srem32(global_tid_32725, tile_sizze_x_37204 *
                                tile_sizze_x_37204), tile_sizze_x_37204 *
                         tile_sizze_x_37204) + squot32(squot32(global_tid_32725,
                                                               tile_sizze_x_37204 *
                                                               tile_sizze_x_37204),
                                                       squot32(res_31237 +
                                                               tile_sizze_x_37204 -
                                                               1,
                                                               tile_sizze_x_37204) *
                                                       squot32(res_31237 +
                                                               tile_sizze_x_37204 -
                                                               1,
                                                               tile_sizze_x_37204));
    gtid_32715 = squot32(srem32(global_tid_32725, tile_sizze_x_37204 *
                                tile_sizze_x_37204) -
                         squot32(srem32(global_tid_32725, tile_sizze_x_37204 *
                                        tile_sizze_x_37204),
                                 tile_sizze_x_37204 * tile_sizze_x_37204) *
                         (tile_sizze_x_37204 * tile_sizze_x_37204),
                         tile_sizze_x_37204) + squot32(squot32(global_tid_32725,
                                                               tile_sizze_x_37204 *
                                                               tile_sizze_x_37204) -
                                                       squot32(squot32(global_tid_32725,
                                                                       tile_sizze_x_37204 *
                                                                       tile_sizze_x_37204),
                                                               squot32(res_31237 +
                                                                       tile_sizze_x_37204 -
                                                                       1,
                                                                       tile_sizze_x_37204) *
                                                               squot32(res_31237 +
                                                                       tile_sizze_x_37204 -
                                                                       1,
                                                                       tile_sizze_x_37204)) *
                                                       (squot32(res_31237 +
                                                                tile_sizze_x_37204 -
                                                                1,
                                                                tile_sizze_x_37204) *
                                                        squot32(res_31237 +
                                                                tile_sizze_x_37204 -
                                                                1,
                                                                tile_sizze_x_37204)),
                                                       squot32(res_31237 +
                                                               tile_sizze_x_37204 -
                                                               1,
                                                               tile_sizze_x_37204)) *
        tile_sizze_x_37204;
    gtid_32716 = srem32(global_tid_32725, tile_sizze_x_37204 *
                        tile_sizze_x_37204) - squot32(srem32(global_tid_32725,
                                                             tile_sizze_x_37204 *
                                                             tile_sizze_x_37204),
                                                      tile_sizze_x_37204 *
                                                      tile_sizze_x_37204) *
        (tile_sizze_x_37204 * tile_sizze_x_37204) -
        squot32(srem32(global_tid_32725, tile_sizze_x_37204 *
                       tile_sizze_x_37204) - squot32(srem32(global_tid_32725,
                                                            tile_sizze_x_37204 *
                                                            tile_sizze_x_37204),
                                                     tile_sizze_x_37204 *
                                                     tile_sizze_x_37204) *
                (tile_sizze_x_37204 * tile_sizze_x_37204), tile_sizze_x_37204) *
        tile_sizze_x_37204 + (squot32(global_tid_32725, tile_sizze_x_37204 *
                                      tile_sizze_x_37204) -
                              squot32(squot32(global_tid_32725,
                                              tile_sizze_x_37204 *
                                              tile_sizze_x_37204),
                                      squot32(res_31237 + tile_sizze_x_37204 -
                                              1, tile_sizze_x_37204) *
                                      squot32(res_31237 + tile_sizze_x_37204 -
                                              1, tile_sizze_x_37204)) *
                              (squot32(res_31237 + tile_sizze_x_37204 - 1,
                                       tile_sizze_x_37204) * squot32(res_31237 +
                                                                     tile_sizze_x_37204 -
                                                                     1,
                                                                     tile_sizze_x_37204)) -
                              squot32(squot32(global_tid_32725,
                                              tile_sizze_x_37204 *
                                              tile_sizze_x_37204) -
                                      squot32(squot32(global_tid_32725,
                                                      tile_sizze_x_37204 *
                                                      tile_sizze_x_37204),
                                              squot32(res_31237 +
                                                      tile_sizze_x_37204 - 1,
                                                      tile_sizze_x_37204) *
                                              squot32(res_31237 +
                                                      tile_sizze_x_37204 - 1,
                                                      tile_sizze_x_37204)) *
                                      (squot32(res_31237 + tile_sizze_x_37204 -
                                               1, tile_sizze_x_37204) *
                                       squot32(res_31237 + tile_sizze_x_37204 -
                                               1, tile_sizze_x_37204)),
                                      squot32(res_31237 + tile_sizze_x_37204 -
                                              1, tile_sizze_x_37204)) *
                              squot32(res_31237 + tile_sizze_x_37204 - 1,
                                      tile_sizze_x_37204)) * tile_sizze_x_37204;
    ltid_37207 = squot32(srem32(global_tid_32725, tile_sizze_x_37204 *
                                tile_sizze_x_37204), tile_sizze_x_37204 *
                         tile_sizze_x_37204);
    ltid_37208 = squot32(srem32(global_tid_32725, tile_sizze_x_37204 *
                                tile_sizze_x_37204) -
                         squot32(srem32(global_tid_32725, tile_sizze_x_37204 *
                                        tile_sizze_x_37204),
                                 tile_sizze_x_37204 * tile_sizze_x_37204) *
                         (tile_sizze_x_37204 * tile_sizze_x_37204),
                         tile_sizze_x_37204);
    ltid_37209 = srem32(global_tid_32725, tile_sizze_x_37204 *
                        tile_sizze_x_37204) - squot32(srem32(global_tid_32725,
                                                             tile_sizze_x_37204 *
                                                             tile_sizze_x_37204),
                                                      tile_sizze_x_37204 *
                                                      tile_sizze_x_37204) *
        (tile_sizze_x_37204 * tile_sizze_x_37204) -
        squot32(srem32(global_tid_32725, tile_sizze_x_37204 *
                       tile_sizze_x_37204) - squot32(srem32(global_tid_32725,
                                                            tile_sizze_x_37204 *
                                                            tile_sizze_x_37204),
                                                     tile_sizze_x_37204 *
                                                     tile_sizze_x_37204) *
                (tile_sizze_x_37204 * tile_sizze_x_37204), tile_sizze_x_37204) *
        tile_sizze_x_37204;
    
    int32_t mm_37197;
    int32_t m_37227;
    bool is_active_37814;
    bool is_active_37815;
    bool active_37817;
    
    if ((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                       res_31237)) &&
        slt32(gtid_32716, res_31237)) {
        mm_37197 = 30 * gtid_32714;
        m_37227 = local_tid_32726 + mm_37197;
        is_active_37814 = slt32(local_tid_32726, 30);
        is_active_37815 = slt32(m_37227, sizze_31215);
        active_37817 = is_active_37814 && is_active_37815;
    }
    
    __local char *mem_37981;
    
    mem_37981 = (__local char *) mem_37981_backing_0;
    
    float res_37563;
    float res_37564;
    float res_37565;
    float res_37566;
    float res_37567;
    float res_37568;
    float res_37569;
    float res_37570;
    float res_37571;
    float res_37572;
    float res_37573;
    float res_37574;
    float res_37575;
    float res_37576;
    float res_37577;
    float res_37578;
    float res_37579;
    float res_37580;
    float res_37581;
    float res_37582;
    float res_37583;
    float res_37584;
    float res_37585;
    float res_37586;
    float res_37587;
    float res_37588;
    float res_37589;
    float res_37590;
    float res_37591;
    float res_37592;
    int32_t m_37598;
    int32_t m_37601;
    int32_t m_37604;
    int32_t m_37607;
    int32_t m_37610;
    int32_t m_37613;
    int32_t m_37616;
    int32_t m_37619;
    int32_t m_37622;
    int32_t m_37625;
    int32_t m_37628;
    int32_t m_37631;
    int32_t m_37634;
    int32_t m_37637;
    int32_t m_37640;
    int32_t m_37643;
    int32_t m_37646;
    int32_t m_37649;
    int32_t m_37652;
    int32_t m_37655;
    int32_t m_37658;
    int32_t m_37661;
    int32_t m_37664;
    int32_t m_37667;
    int32_t m_37670;
    int32_t m_37673;
    int32_t m_37676;
    int32_t m_37679;
    int32_t m_37682;
    
    if ((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                       res_31237)) &&
        slt32(gtid_32716, res_31237)) {
        float acc_clone_37233;
        float acc_clone_37244;
        float acc_clone_37255;
        float acc_clone_37266;
        float acc_clone_37277;
        float acc_clone_37288;
        float acc_clone_37299;
        float acc_clone_37310;
        float acc_clone_37321;
        float acc_clone_37332;
        float acc_clone_37343;
        float acc_clone_37354;
        float acc_clone_37365;
        float acc_clone_37376;
        float acc_clone_37387;
        float acc_clone_37398;
        float acc_clone_37409;
        float acc_clone_37420;
        float acc_clone_37431;
        float acc_clone_37442;
        float acc_clone_37453;
        float acc_clone_37464;
        float acc_clone_37475;
        float acc_clone_37486;
        float acc_clone_37497;
        float acc_clone_37508;
        float acc_clone_37519;
        float acc_clone_37530;
        float acc_clone_37541;
        float acc_clone_37552;
        
        acc_clone_37233 = 0.0F;
        acc_clone_37244 = 0.0F;
        acc_clone_37255 = 0.0F;
        acc_clone_37266 = 0.0F;
        acc_clone_37277 = 0.0F;
        acc_clone_37288 = 0.0F;
        acc_clone_37299 = 0.0F;
        acc_clone_37310 = 0.0F;
        acc_clone_37321 = 0.0F;
        acc_clone_37332 = 0.0F;
        acc_clone_37343 = 0.0F;
        acc_clone_37354 = 0.0F;
        acc_clone_37365 = 0.0F;
        acc_clone_37376 = 0.0F;
        acc_clone_37387 = 0.0F;
        acc_clone_37398 = 0.0F;
        acc_clone_37409 = 0.0F;
        acc_clone_37420 = 0.0F;
        acc_clone_37431 = 0.0F;
        acc_clone_37442 = 0.0F;
        acc_clone_37453 = 0.0F;
        acc_clone_37464 = 0.0F;
        acc_clone_37475 = 0.0F;
        acc_clone_37486 = 0.0F;
        acc_clone_37497 = 0.0F;
        acc_clone_37508 = 0.0F;
        acc_clone_37519 = 0.0F;
        acc_clone_37530 = 0.0F;
        acc_clone_37541 = 0.0F;
        acc_clone_37552 = 0.0F;
        for (int32_t loop_ind_37562 = 0; loop_ind_37562 < n_31219;
             loop_ind_37562++) {
            int32_t i_33066;
            
            i_33066 = loop_ind_37562;
            barrier(CLK_LOCAL_MEM_FENCE);
            
            float x_33072;
            float x_33073;
            float x_33075;
            float x_33071;
            
            x_33072 = *(__global float *) &mem_37907[(i_33066 * res_31237 +
                                                      gtid_32715) * 4];
            x_33073 = *(__global float *) &mem_37911[(i_33066 * res_31237 +
                                                      gtid_32716) * 4];
            x_33075 = x_33072 * x_33073;
            if (active_37817) {
                float x_37818 = *(__global float *) &mem_37978[(i_33066 *
                                                                sizze_31215 +
                                                                m_37227) * 4];
                
                x_33071 = x_37818;
            } else {
                x_33071 = 0.0F;
            }
            for (int32_t comb_iter_38643 = 0; comb_iter_38643 < 1;
                 comb_iter_38643++) {
                int32_t cid_37231;
                int32_t flat_comb_id_38644 = comb_iter_38643 *
                        tiled_group_sizze_37206 + local_tid_32726;
                
                cid_37231 = flat_comb_id_38644;
                if (slt32(cid_37231, tiled_group_sizze_37206) &&
                    (slt32(local_tid_32726, 30) && slt32(m_37227,
                                                         sizze_31215))) {
                    *(__local float *) &mem_37981[cid_37231 * 4] = x_33071;
                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            
            float x_37238;
            bool res_37239;
            float y_37240;
            float res_37241;
            float res_37242;
            float x_37249;
            bool res_37250;
            float y_37251;
            float res_37252;
            float res_37253;
            float x_37260;
            bool res_37261;
            float y_37262;
            float res_37263;
            float res_37264;
            float x_37271;
            bool res_37272;
            float y_37273;
            float res_37274;
            float res_37275;
            float x_37282;
            bool res_37283;
            float y_37284;
            float res_37285;
            float res_37286;
            float x_37293;
            bool res_37294;
            float y_37295;
            float res_37296;
            float res_37297;
            float x_37304;
            bool res_37305;
            float y_37306;
            float res_37307;
            float res_37308;
            float x_37315;
            bool res_37316;
            float y_37317;
            float res_37318;
            float res_37319;
            float x_37326;
            bool res_37327;
            float y_37328;
            float res_37329;
            float res_37330;
            float x_37337;
            bool res_37338;
            float y_37339;
            float res_37340;
            float res_37341;
            float x_37348;
            bool res_37349;
            float y_37350;
            float res_37351;
            float res_37352;
            float x_37359;
            bool res_37360;
            float y_37361;
            float res_37362;
            float res_37363;
            float x_37370;
            bool res_37371;
            float y_37372;
            float res_37373;
            float res_37374;
            float x_37381;
            bool res_37382;
            float y_37383;
            float res_37384;
            float res_37385;
            float x_37392;
            bool res_37393;
            float y_37394;
            float res_37395;
            float res_37396;
            float x_37403;
            bool res_37404;
            float y_37405;
            float res_37406;
            float res_37407;
            float x_37414;
            bool res_37415;
            float y_37416;
            float res_37417;
            float res_37418;
            float x_37425;
            bool res_37426;
            float y_37427;
            float res_37428;
            float res_37429;
            float x_37436;
            bool res_37437;
            float y_37438;
            float res_37439;
            float res_37440;
            float x_37447;
            bool res_37448;
            float y_37449;
            float res_37450;
            float res_37451;
            float x_37458;
            bool res_37459;
            float y_37460;
            float res_37461;
            float res_37462;
            float x_37469;
            bool res_37470;
            float y_37471;
            float res_37472;
            float res_37473;
            float x_37480;
            bool res_37481;
            float y_37482;
            float res_37483;
            float res_37484;
            float x_37491;
            bool res_37492;
            float y_37493;
            float res_37494;
            float res_37495;
            float x_37502;
            bool res_37503;
            float y_37504;
            float res_37505;
            float res_37506;
            float x_37513;
            bool res_37514;
            float y_37515;
            float res_37516;
            float res_37517;
            float x_37524;
            bool res_37525;
            float y_37526;
            float res_37527;
            float res_37528;
            float x_37535;
            bool res_37536;
            float y_37537;
            float res_37538;
            float res_37539;
            float x_37546;
            bool res_37547;
            float y_37548;
            float res_37549;
            float res_37550;
            float x_37557;
            bool res_37558;
            float y_37559;
            float res_37560;
            float res_37561;
            
            x_37238 = *(__local float *) &mem_37981[0];
            res_37239 = futrts_isnan32(x_37238);
            if (res_37239) {
                y_37240 = 0.0F;
            } else {
                y_37240 = 1.0F;
            }
            res_37241 = x_33075 * y_37240;
            res_37242 = acc_clone_37233 + res_37241;
            x_37249 = *(__local float *) &mem_37981[4];
            res_37250 = futrts_isnan32(x_37249);
            if (res_37250) {
                y_37251 = 0.0F;
            } else {
                y_37251 = 1.0F;
            }
            res_37252 = x_33075 * y_37251;
            res_37253 = acc_clone_37244 + res_37252;
            x_37260 = *(__local float *) &mem_37981[8];
            res_37261 = futrts_isnan32(x_37260);
            if (res_37261) {
                y_37262 = 0.0F;
            } else {
                y_37262 = 1.0F;
            }
            res_37263 = x_33075 * y_37262;
            res_37264 = acc_clone_37255 + res_37263;
            x_37271 = *(__local float *) &mem_37981[12];
            res_37272 = futrts_isnan32(x_37271);
            if (res_37272) {
                y_37273 = 0.0F;
            } else {
                y_37273 = 1.0F;
            }
            res_37274 = x_33075 * y_37273;
            res_37275 = acc_clone_37266 + res_37274;
            x_37282 = *(__local float *) &mem_37981[16];
            res_37283 = futrts_isnan32(x_37282);
            if (res_37283) {
                y_37284 = 0.0F;
            } else {
                y_37284 = 1.0F;
            }
            res_37285 = x_33075 * y_37284;
            res_37286 = acc_clone_37277 + res_37285;
            x_37293 = *(__local float *) &mem_37981[20];
            res_37294 = futrts_isnan32(x_37293);
            if (res_37294) {
                y_37295 = 0.0F;
            } else {
                y_37295 = 1.0F;
            }
            res_37296 = x_33075 * y_37295;
            res_37297 = acc_clone_37288 + res_37296;
            x_37304 = *(__local float *) &mem_37981[24];
            res_37305 = futrts_isnan32(x_37304);
            if (res_37305) {
                y_37306 = 0.0F;
            } else {
                y_37306 = 1.0F;
            }
            res_37307 = x_33075 * y_37306;
            res_37308 = acc_clone_37299 + res_37307;
            x_37315 = *(__local float *) &mem_37981[28];
            res_37316 = futrts_isnan32(x_37315);
            if (res_37316) {
                y_37317 = 0.0F;
            } else {
                y_37317 = 1.0F;
            }
            res_37318 = x_33075 * y_37317;
            res_37319 = acc_clone_37310 + res_37318;
            x_37326 = *(__local float *) &mem_37981[32];
            res_37327 = futrts_isnan32(x_37326);
            if (res_37327) {
                y_37328 = 0.0F;
            } else {
                y_37328 = 1.0F;
            }
            res_37329 = x_33075 * y_37328;
            res_37330 = acc_clone_37321 + res_37329;
            x_37337 = *(__local float *) &mem_37981[36];
            res_37338 = futrts_isnan32(x_37337);
            if (res_37338) {
                y_37339 = 0.0F;
            } else {
                y_37339 = 1.0F;
            }
            res_37340 = x_33075 * y_37339;
            res_37341 = acc_clone_37332 + res_37340;
            x_37348 = *(__local float *) &mem_37981[40];
            res_37349 = futrts_isnan32(x_37348);
            if (res_37349) {
                y_37350 = 0.0F;
            } else {
                y_37350 = 1.0F;
            }
            res_37351 = x_33075 * y_37350;
            res_37352 = acc_clone_37343 + res_37351;
            x_37359 = *(__local float *) &mem_37981[44];
            res_37360 = futrts_isnan32(x_37359);
            if (res_37360) {
                y_37361 = 0.0F;
            } else {
                y_37361 = 1.0F;
            }
            res_37362 = x_33075 * y_37361;
            res_37363 = acc_clone_37354 + res_37362;
            x_37370 = *(__local float *) &mem_37981[48];
            res_37371 = futrts_isnan32(x_37370);
            if (res_37371) {
                y_37372 = 0.0F;
            } else {
                y_37372 = 1.0F;
            }
            res_37373 = x_33075 * y_37372;
            res_37374 = acc_clone_37365 + res_37373;
            x_37381 = *(__local float *) &mem_37981[52];
            res_37382 = futrts_isnan32(x_37381);
            if (res_37382) {
                y_37383 = 0.0F;
            } else {
                y_37383 = 1.0F;
            }
            res_37384 = x_33075 * y_37383;
            res_37385 = acc_clone_37376 + res_37384;
            x_37392 = *(__local float *) &mem_37981[56];
            res_37393 = futrts_isnan32(x_37392);
            if (res_37393) {
                y_37394 = 0.0F;
            } else {
                y_37394 = 1.0F;
            }
            res_37395 = x_33075 * y_37394;
            res_37396 = acc_clone_37387 + res_37395;
            x_37403 = *(__local float *) &mem_37981[60];
            res_37404 = futrts_isnan32(x_37403);
            if (res_37404) {
                y_37405 = 0.0F;
            } else {
                y_37405 = 1.0F;
            }
            res_37406 = x_33075 * y_37405;
            res_37407 = acc_clone_37398 + res_37406;
            x_37414 = *(__local float *) &mem_37981[64];
            res_37415 = futrts_isnan32(x_37414);
            if (res_37415) {
                y_37416 = 0.0F;
            } else {
                y_37416 = 1.0F;
            }
            res_37417 = x_33075 * y_37416;
            res_37418 = acc_clone_37409 + res_37417;
            x_37425 = *(__local float *) &mem_37981[68];
            res_37426 = futrts_isnan32(x_37425);
            if (res_37426) {
                y_37427 = 0.0F;
            } else {
                y_37427 = 1.0F;
            }
            res_37428 = x_33075 * y_37427;
            res_37429 = acc_clone_37420 + res_37428;
            x_37436 = *(__local float *) &mem_37981[72];
            res_37437 = futrts_isnan32(x_37436);
            if (res_37437) {
                y_37438 = 0.0F;
            } else {
                y_37438 = 1.0F;
            }
            res_37439 = x_33075 * y_37438;
            res_37440 = acc_clone_37431 + res_37439;
            x_37447 = *(__local float *) &mem_37981[76];
            res_37448 = futrts_isnan32(x_37447);
            if (res_37448) {
                y_37449 = 0.0F;
            } else {
                y_37449 = 1.0F;
            }
            res_37450 = x_33075 * y_37449;
            res_37451 = acc_clone_37442 + res_37450;
            x_37458 = *(__local float *) &mem_37981[80];
            res_37459 = futrts_isnan32(x_37458);
            if (res_37459) {
                y_37460 = 0.0F;
            } else {
                y_37460 = 1.0F;
            }
            res_37461 = x_33075 * y_37460;
            res_37462 = acc_clone_37453 + res_37461;
            x_37469 = *(__local float *) &mem_37981[84];
            res_37470 = futrts_isnan32(x_37469);
            if (res_37470) {
                y_37471 = 0.0F;
            } else {
                y_37471 = 1.0F;
            }
            res_37472 = x_33075 * y_37471;
            res_37473 = acc_clone_37464 + res_37472;
            x_37480 = *(__local float *) &mem_37981[88];
            res_37481 = futrts_isnan32(x_37480);
            if (res_37481) {
                y_37482 = 0.0F;
            } else {
                y_37482 = 1.0F;
            }
            res_37483 = x_33075 * y_37482;
            res_37484 = acc_clone_37475 + res_37483;
            x_37491 = *(__local float *) &mem_37981[92];
            res_37492 = futrts_isnan32(x_37491);
            if (res_37492) {
                y_37493 = 0.0F;
            } else {
                y_37493 = 1.0F;
            }
            res_37494 = x_33075 * y_37493;
            res_37495 = acc_clone_37486 + res_37494;
            x_37502 = *(__local float *) &mem_37981[96];
            res_37503 = futrts_isnan32(x_37502);
            if (res_37503) {
                y_37504 = 0.0F;
            } else {
                y_37504 = 1.0F;
            }
            res_37505 = x_33075 * y_37504;
            res_37506 = acc_clone_37497 + res_37505;
            x_37513 = *(__local float *) &mem_37981[100];
            res_37514 = futrts_isnan32(x_37513);
            if (res_37514) {
                y_37515 = 0.0F;
            } else {
                y_37515 = 1.0F;
            }
            res_37516 = x_33075 * y_37515;
            res_37517 = acc_clone_37508 + res_37516;
            x_37524 = *(__local float *) &mem_37981[104];
            res_37525 = futrts_isnan32(x_37524);
            if (res_37525) {
                y_37526 = 0.0F;
            } else {
                y_37526 = 1.0F;
            }
            res_37527 = x_33075 * y_37526;
            res_37528 = acc_clone_37519 + res_37527;
            x_37535 = *(__local float *) &mem_37981[108];
            res_37536 = futrts_isnan32(x_37535);
            if (res_37536) {
                y_37537 = 0.0F;
            } else {
                y_37537 = 1.0F;
            }
            res_37538 = x_33075 * y_37537;
            res_37539 = acc_clone_37530 + res_37538;
            x_37546 = *(__local float *) &mem_37981[112];
            res_37547 = futrts_isnan32(x_37546);
            if (res_37547) {
                y_37548 = 0.0F;
            } else {
                y_37548 = 1.0F;
            }
            res_37549 = x_33075 * y_37548;
            res_37550 = acc_clone_37541 + res_37549;
            x_37557 = *(__local float *) &mem_37981[116];
            res_37558 = futrts_isnan32(x_37557);
            if (res_37558) {
                y_37559 = 0.0F;
            } else {
                y_37559 = 1.0F;
            }
            res_37560 = x_33075 * y_37559;
            res_37561 = acc_clone_37552 + res_37560;
            
            float acc_clone_tmp_38613 = res_37242;
            float acc_clone_tmp_38614 = res_37253;
            float acc_clone_tmp_38615 = res_37264;
            float acc_clone_tmp_38616 = res_37275;
            float acc_clone_tmp_38617 = res_37286;
            float acc_clone_tmp_38618 = res_37297;
            float acc_clone_tmp_38619 = res_37308;
            float acc_clone_tmp_38620 = res_37319;
            float acc_clone_tmp_38621 = res_37330;
            float acc_clone_tmp_38622 = res_37341;
            float acc_clone_tmp_38623 = res_37352;
            float acc_clone_tmp_38624 = res_37363;
            float acc_clone_tmp_38625 = res_37374;
            float acc_clone_tmp_38626 = res_37385;
            float acc_clone_tmp_38627 = res_37396;
            float acc_clone_tmp_38628 = res_37407;
            float acc_clone_tmp_38629 = res_37418;
            float acc_clone_tmp_38630 = res_37429;
            float acc_clone_tmp_38631 = res_37440;
            float acc_clone_tmp_38632 = res_37451;
            float acc_clone_tmp_38633 = res_37462;
            float acc_clone_tmp_38634 = res_37473;
            float acc_clone_tmp_38635 = res_37484;
            float acc_clone_tmp_38636 = res_37495;
            float acc_clone_tmp_38637 = res_37506;
            float acc_clone_tmp_38638 = res_37517;
            float acc_clone_tmp_38639 = res_37528;
            float acc_clone_tmp_38640 = res_37539;
            float acc_clone_tmp_38641 = res_37550;
            float acc_clone_tmp_38642;
            
            acc_clone_tmp_38642 = res_37561;
            acc_clone_37233 = acc_clone_tmp_38613;
            acc_clone_37244 = acc_clone_tmp_38614;
            acc_clone_37255 = acc_clone_tmp_38615;
            acc_clone_37266 = acc_clone_tmp_38616;
            acc_clone_37277 = acc_clone_tmp_38617;
            acc_clone_37288 = acc_clone_tmp_38618;
            acc_clone_37299 = acc_clone_tmp_38619;
            acc_clone_37310 = acc_clone_tmp_38620;
            acc_clone_37321 = acc_clone_tmp_38621;
            acc_clone_37332 = acc_clone_tmp_38622;
            acc_clone_37343 = acc_clone_tmp_38623;
            acc_clone_37354 = acc_clone_tmp_38624;
            acc_clone_37365 = acc_clone_tmp_38625;
            acc_clone_37376 = acc_clone_tmp_38626;
            acc_clone_37387 = acc_clone_tmp_38627;
            acc_clone_37398 = acc_clone_tmp_38628;
            acc_clone_37409 = acc_clone_tmp_38629;
            acc_clone_37420 = acc_clone_tmp_38630;
            acc_clone_37431 = acc_clone_tmp_38631;
            acc_clone_37442 = acc_clone_tmp_38632;
            acc_clone_37453 = acc_clone_tmp_38633;
            acc_clone_37464 = acc_clone_tmp_38634;
            acc_clone_37475 = acc_clone_tmp_38635;
            acc_clone_37486 = acc_clone_tmp_38636;
            acc_clone_37497 = acc_clone_tmp_38637;
            acc_clone_37508 = acc_clone_tmp_38638;
            acc_clone_37519 = acc_clone_tmp_38639;
            acc_clone_37530 = acc_clone_tmp_38640;
            acc_clone_37541 = acc_clone_tmp_38641;
            acc_clone_37552 = acc_clone_tmp_38642;
        }
        res_37563 = acc_clone_37233;
        res_37564 = acc_clone_37244;
        res_37565 = acc_clone_37255;
        res_37566 = acc_clone_37266;
        res_37567 = acc_clone_37277;
        res_37568 = acc_clone_37288;
        res_37569 = acc_clone_37299;
        res_37570 = acc_clone_37310;
        res_37571 = acc_clone_37321;
        res_37572 = acc_clone_37332;
        res_37573 = acc_clone_37343;
        res_37574 = acc_clone_37354;
        res_37575 = acc_clone_37365;
        res_37576 = acc_clone_37376;
        res_37577 = acc_clone_37387;
        res_37578 = acc_clone_37398;
        res_37579 = acc_clone_37409;
        res_37580 = acc_clone_37420;
        res_37581 = acc_clone_37431;
        res_37582 = acc_clone_37442;
        res_37583 = acc_clone_37453;
        res_37584 = acc_clone_37464;
        res_37585 = acc_clone_37475;
        res_37586 = acc_clone_37486;
        res_37587 = acc_clone_37497;
        res_37588 = acc_clone_37508;
        res_37589 = acc_clone_37519;
        res_37590 = acc_clone_37530;
        res_37591 = acc_clone_37541;
        res_37592 = acc_clone_37552;
        m_37598 = 1 + mm_37197;
        m_37601 = 2 + mm_37197;
        m_37604 = 3 + mm_37197;
        m_37607 = 4 + mm_37197;
        m_37610 = 5 + mm_37197;
        m_37613 = 6 + mm_37197;
        m_37616 = 7 + mm_37197;
        m_37619 = 8 + mm_37197;
        m_37622 = 9 + mm_37197;
        m_37625 = 10 + mm_37197;
        m_37628 = 11 + mm_37197;
        m_37631 = 12 + mm_37197;
        m_37634 = 13 + mm_37197;
        m_37637 = 14 + mm_37197;
        m_37640 = 15 + mm_37197;
        m_37643 = 16 + mm_37197;
        m_37646 = 17 + mm_37197;
        m_37649 = 18 + mm_37197;
        m_37652 = 19 + mm_37197;
        m_37655 = 20 + mm_37197;
        m_37658 = 21 + mm_37197;
        m_37661 = 22 + mm_37197;
        m_37664 = 23 + mm_37197;
        m_37667 = 24 + mm_37197;
        m_37670 = 25 + mm_37197;
        m_37673 = 26 + mm_37197;
        m_37676 = 27 + mm_37197;
        m_37679 = 28 + mm_37197;
        m_37682 = 29 + mm_37197;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, mm_37197) &&
                                             slt32(mm_37197, sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(mm_37197 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37563;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37598) && slt32(m_37598,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37598 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37564;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37601) && slt32(m_37601,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37601 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37565;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37604) && slt32(m_37604,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37604 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37566;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37607) && slt32(m_37607,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37607 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37567;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37610) && slt32(m_37610,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37610 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37568;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37613) && slt32(m_37613,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37613 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37569;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37616) && slt32(m_37616,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37616 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37570;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37619) && slt32(m_37619,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37619 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37571;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37622) && slt32(m_37622,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37622 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37572;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37625) && slt32(m_37625,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37625 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37573;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37628) && slt32(m_37628,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37628 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37574;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37631) && slt32(m_37631,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37631 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37575;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37634) && slt32(m_37634,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37634 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37576;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37637) && slt32(m_37637,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37637 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37577;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37640) && slt32(m_37640,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37640 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37578;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37643) && slt32(m_37643,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37643 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37579;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37646) && slt32(m_37646,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37646 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37580;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37649) && slt32(m_37649,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37649 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37581;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37652) && slt32(m_37652,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37652 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37582;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37655) && slt32(m_37655,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37655 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37583;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37658) && slt32(m_37658,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37658 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37584;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37661) && slt32(m_37661,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37661 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37585;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37664) && slt32(m_37664,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37664 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37586;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37667) && slt32(m_37667,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37667 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37587;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37670) && slt32(m_37670,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37670 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37588;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37673) && slt32(m_37673,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37673 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37589;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37676) && slt32(m_37676,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37676 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37590;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37679) && slt32(m_37679,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37679 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37591;
    }
    if (((((slt32(gtid_32714, gidzz_range_37200) && slt32(gtid_32715,
                                                          res_31237)) &&
           slt32(gtid_32716, res_31237)) && (sle32(0, m_37682) && slt32(m_37682,
                                                                        sizze_31215))) &&
         (sle32(0, gtid_32715) && slt32(gtid_32715, res_31237))) && (sle32(0,
                                                                           gtid_32716) &&
                                                                     slt32(gtid_32716,
                                                                           res_31237))) {
        *(__global float *) &mem_37974[(m_37682 * (res_31237 * res_31237) +
                                        gtid_32715 * res_31237 + gtid_32716) *
                                       4] = res_37592;
    }
}
/*
 * map_33492: copy one float per work-item from mem_38030 to mem_38023.
 *
 * The flat global id is split into (row, col) using res_31373 as the row
 * length. Work-items with row >= sizze_31215 or col >= res_31373 do nothing.
 * Both buffers are addressed as byte arrays, hence the "* 4" (one float).
 *
 * sizze_31215  number of rows processed
 * res_31373    number of floats per row
 * mem_38023    destination buffer
 * mem_38030    source buffer
 */
__kernel void map_33492(int32_t sizze_31215, int32_t res_31373, __global
                        unsigned char *mem_38023, __global
                        unsigned char *mem_38030)
{
    const int32_t flat_id = get_global_id(0);
    const int32_t row = squot32(flat_id, res_31373);
    const int32_t col = flat_id - row * res_31373;

    /* Upper-bound guard: surplus work-items leave immediately. */
    if (!(slt32(row, sizze_31215) && slt32(col, res_31373)))
        return;

    const int32_t byte_off = (row * res_31373 + col) * 4;
    const float value = *(__global float *) &mem_38030[byte_off];

    /* The store is additionally protected against negative indices. */
    if (sle32(0, row) && sle32(0, col)) {
        *(__global float *) &mem_38023[byte_off] = value;
    }
}
/*
 * map_33542: compute one float of mem_38030 per work-item from row data held
 * in mem_38023 and a per-row flag held in mem_38026.
 *
 * The flat global id is split into (row, col) with res_31373 floats per row;
 * col is further split into q = sdiv32(col, j_31369) and
 * r = smod32(col, j_31369).
 *
 *   flag set   : result = in[row][r + j_31369 * q]
 *   flag clear : ratio  = in[row][r] / in[row][i_33757]
 *                q < m_31319 : result = in[row][r + j_31369 * (q + 1)]
 *                                     - ratio * in[row][i_33757 + j_31369 * (q + 1)]
 *                otherwise   : result = ratio
 *
 * NOTE(review): this looks like one elimination step of a batched matrix
 * inversion, with the flag marking rows whose element at column i_33757 is
 * zero -- confirm against the generating program.
 */
__kernel void map_33542(int32_t sizze_31215, int32_t m_31319, int32_t j_31369,
                        int32_t res_31373, int32_t i_33757, __global
                        unsigned char *mem_38023, __global
                        unsigned char *mem_38026, __global
                        unsigned char *mem_38030)
{
    const int32_t flat_id = get_global_id(0);
    const int32_t row = squot32(flat_id, res_31373);
    const int32_t col = flat_id - row * res_31373;

    /* Surplus work-items leave immediately. */
    if (!(slt32(row, sizze_31215) && slt32(col, res_31373)))
        return;

    const int32_t row_base = row * res_31373;
    const float divisor = *(__global float *) &mem_38023[(row_base + i_33757) *
                                                         4];
    const bool flag = *(__global bool *) &mem_38026[row];
    const int32_t q = sdiv32(col, j_31369);
    const int32_t r = smod32(col, j_31369);
    float result;

    if (flag) {
        const int32_t src = r + j_31369 * q;

        result = *(__global float *) &mem_38023[(row_base + src) * 4];
    } else {
        const float numer = *(__global float *) &mem_38023[(row_base + r) * 4];
        const float ratio = numer / divisor;

        if (slt32(q, m_31319)) {
            const int32_t shift = j_31369 * (1 + q);
            const int32_t minuend_idx = r + shift;
            const int32_t factor_idx = i_33757 + shift;
            const float minuend = *(__global float *) &mem_38023[(row_base +
                                                                  minuend_idx) *
                                                                 4];
            const float factor = *(__global float *) &mem_38023[(row_base +
                                                                 factor_idx) *
                                                                4];
            /* Product and difference are deliberately separate statements so
             * the multiply is rounded on its own before the subtraction. */
            const float scaled = ratio * factor;

            result = minuend - scaled;
        } else {
            result = ratio;
        }
    }

    *(__global float *) &mem_38030[(row_base + col) * 4] = result;
}
/*
 * map_33603: for each row, record whether the float at column i_33757 of
 * mem_38023 compares equal to 0.0.
 *
 * One work-item handles one row; rows hold res_31373 floats each. The result
 * is stored as a bool at mem_38026[row]. Work-items with
 * row >= sizze_31215 do nothing.
 */
__kernel void map_33603(int32_t sizze_31215, int32_t res_31373, int32_t i_33757,
                        __global unsigned char *mem_38023, __global
                        unsigned char *mem_38026)
{
    const int32_t row = get_global_id(0);

    if (!slt32(row, sizze_31215))
        return;

    const float elem = *(__global float *) &mem_38023[(row * res_31373 +
                                                       i_33757) * 4];
    const bool is_zero = elem == 0.0F;

    *(__global bool *) &mem_38026[row] = is_zero;
}
/*
 * map_33700: fill mem_38023 (sizze_31215 rows of res_31373 floats) from
 * res_mem_38006, one float per work-item.
 *
 * The flat global id is split into (row, col); col is further split into
 * q = sdiv32(col, j_31369) and r = smod32(col, j_31369).
 *
 *   r <  res_31237 : copy res_mem_38006[row][q][r], where each row of the
 *                    source holds res_31237 * res_31237 floats
 *   r >= res_31237 : write 1.0 when r == res_31237 + q, otherwise 0.0
 *
 * NOTE(review): this appears to place an identity block beside each source
 * matrix -- confirm against the generating program.
 */
__kernel void map_33700(int32_t sizze_31215, int32_t res_31237, int32_t j_31369,
                        int32_t res_31373, __global
                        unsigned char *res_mem_38006, __global
                        unsigned char *mem_38023)
{
    const int32_t flat_id = get_global_id(0);
    const int32_t row = squot32(flat_id, res_31373);
    const int32_t col = flat_id - row * res_31373;

    /* Surplus work-items leave immediately. */
    if (!(slt32(row, sizze_31215) && slt32(col, res_31373)))
        return;

    const int32_t q = sdiv32(col, j_31369);
    const int32_t r = smod32(col, j_31369);
    float value;

    if (slt32(r, res_31237)) {
        const int32_t src = row * (res_31237 * res_31237) + q * res_31237 + r;

        value = *(__global float *) &res_mem_38006[src * 4];
    } else {
        value = (r == res_31237 + q) ? 1.0F : 0.0F;
    }

    *(__global float *) &mem_38023[(row * res_31373 + col) * 4] = value;
}
/*
 * map_33861: each work-item (index gtid < sizze_31215) computes res_31237
 * dot products of length n_31219 and writes them to mem_38048.
 *
 * For every i in [0, res_31237):
 *   sum over k in [0, n_31219) of
 *     arg_mem_37903[i * sizze_31214 + k] * mem_38041[k * sizze_31215 + gtid]
 * where a term contributes 0.0 whenever the mem_38041 operand is NaN.
 *
 * Results are first staged in mem_38044 at a slot derived from the group id,
 * local id and the compile-time group size, then copied to
 * mem_38048[gtid + i * sizze_31215]. All indices are in floats; buffers are
 * byte-addressed, hence the "* 4".
 */
__kernel void map_33861(int32_t sizze_31214, int32_t sizze_31215,
                        int32_t n_31219, int32_t res_31237, __global
                        unsigned char *arg_mem_37903, __global
                        unsigned char *mem_38041, __global
                        unsigned char *mem_38044, __global
                        unsigned char *mem_38048)
{
    const int32_t group_sizze_33882 = mainzigroup_sizze_33855;
    const int32_t gtid = get_global_id(0);
    const int32_t ltid = get_local_id(0);
    const int32_t gid = get_group_id(0);

    /* Surplus work-items leave immediately. */
    if (!slt32(gtid, sizze_31215))
        return;

    /* Staging slot of this work-item for i == 0; consecutive i values are
     * group_sizze_33882 floats apart. */
    const int32_t stage_base = gid * (group_sizze_33882 * res_31237) + ltid;

    for (int32_t i = 0; i < res_31237; i++) {
        float acc = 0.0F;

        for (int32_t k = 0; k < n_31219; k++) {
            const float lhs = *(__global float *) &arg_mem_37903[(i *
                                                                  sizze_31214 +
                                                                  k) * 4];
            const float rhs = *(__global float *) &mem_38041[(k * sizze_31215 +
                                                              gtid) * 4];
            /* NaN operands are treated as contributing nothing. */
            const float term = futrts_isnan32(rhs) ? 0.0F : lhs * rhs;

            /* Accumulate in a separate statement from the multiply. */
            acc = acc + term;
        }
        *(__global float *) &mem_38044[(stage_base + i * group_sizze_33882) *
                                       4] = acc;
    }

    /* Second pass, kept separate from the first: copy the staged values to
     * their final location only after all of them have been written. */
    for (int32_t i = 0; i < res_31237; i++) {
        const float staged = *(__global float *) &mem_38044[(stage_base + i *
                                                             group_sizze_33882) *
                                                            4];

        *(__global float *) &mem_38048[(gtid + i * sizze_31215) * 4] = staged;
    }
}
// map_33955: tiled sum-of-products kernel over 4-byte floats.
//
// For each in-range pair (gtid_33946 < sizze_31215, gtid_33947 < res_31237)
// the kernel writes one float to mem_38072 at element index
// gtid_33946 * res_31237 + gtid_33947.  The value is accumulated over
// n_31219 steps, handled in chunks of at most tile_sizze_37710 steps.  Each
// step adds the product of one staged mem_37907 element and one staged
// images_mem_37894 element, or 0.0F when futrts_isnan32 returns true for the
// images_mem_37894 element.
//
// Parameters:
//   sizze_31215      - bound on gtid_33946 (first output index).
//   sizze_31216      - stride, in floats, multiplied by gtid_33946 when
//                      indexing images_mem_37894.
//   n_31219          - total number of accumulation steps.
//   res_31237        - bound on gtid_33947; also the stride, in floats, used
//                      for mem_37907 and mem_38072.
//   images_mem_37894 - input buffer, read as floats.
//   mem_37907        - input buffer, read as floats.
//   mem_38072        - output buffer, written as floats.
//
// NOTE(review): tile stores are indexed from get_local_id(0), while the
// global reads feeding them and the later tile reads use indices derived from
// get_global_id(0).  These agree only if a work-group holds exactly
// tile_sizze_37710 * tile_sizze_37710 work-items - presumably guaranteed by
// the host launch code; confirm there.
__kernel void map_33955(int32_t sizze_31215, int32_t sizze_31216,
                        int32_t n_31219, int32_t res_31237, __global
                        unsigned char *images_mem_37894, __global
                        unsigned char *mem_37907, __global
                        unsigned char *mem_38072)
{
    // Tile edge length and the tile area; both come from the same
    // mainzitile_sizze_37709 constant, which is defined outside this kernel.
    const int32_t tile_sizze_37710 = mainzitile_sizze_37709;
    const int32_t tiled_group_sizze_37711 = mainzitile_sizze_37709 *
                  mainzitile_sizze_37709;
    // block_dim0..2 are not referenced anywhere in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    // Backing storage for the two tiles, each sized for tile * tile floats.
    // ALIGNED_LOCAL_MEMORY is a macro defined elsewhere; the results are used
    // below as __local byte arrays.
    ALIGNED_LOCAL_MEMORY(mem_38064_backing_0, 4 *
                         sext_i32_i64(mainzitile_sizze_37709 *
                         mainzitile_sizze_37709));
    ALIGNED_LOCAL_MEMORY(mem_38068_backing_1, 4 *
                         sext_i32_i64(mainzitile_sizze_37709 *
                         mainzitile_sizze_37709));
    
    int32_t global_tid_33955;
    int32_t local_tid_33956;
    int32_t group_sizze_38723;
    int32_t wave_sizze_38722;
    int32_t group_id_33957;
    
    // group_sizze_38723, wave_sizze_38722 and group_id_33957 are assigned
    // here but never read again in this kernel.
    global_tid_33955 = get_global_id(0);
    local_tid_33956 = get_local_id(0);
    group_sizze_38723 = get_local_size(0);
    wave_sizze_38722 = LOCKSTEP_WIDTH;
    group_id_33957 = get_group_id(0);
    
    int32_t gtid_33946;
    int32_t gtid_33947;
    int32_t ltid_37712;
    int32_t ltid_37713;
    
    // With t = tile_sizze_37710, g = squot32(global_tid, t * t) and
    // w = squot32(res_31237 + t - 1, t), the four assignments below compute:
    //   ltid_37712 = squot32(srem32(global_tid, t * t), t)
    //   ltid_37713 = srem32(global_tid, t * t) - ltid_37712 * t
    //   gtid_33946 = ltid_37712 + squot32(g, w) * t
    //   gtid_33947 = ltid_37713 + (g - squot32(g, w) * w) * t
    gtid_33946 = squot32(srem32(global_tid_33955, tile_sizze_37710 *
                                tile_sizze_37710), tile_sizze_37710) +
        squot32(squot32(global_tid_33955, tile_sizze_37710 * tile_sizze_37710),
                squot32(res_31237 + tile_sizze_37710 - 1, tile_sizze_37710)) *
        tile_sizze_37710;
    gtid_33947 = srem32(global_tid_33955, tile_sizze_37710 * tile_sizze_37710) -
        squot32(srem32(global_tid_33955, tile_sizze_37710 * tile_sizze_37710),
                tile_sizze_37710) * tile_sizze_37710 +
        (squot32(global_tid_33955, tile_sizze_37710 * tile_sizze_37710) -
         squot32(squot32(global_tid_33955, tile_sizze_37710 * tile_sizze_37710),
                 squot32(res_31237 + tile_sizze_37710 - 1, tile_sizze_37710)) *
         squot32(res_31237 + tile_sizze_37710 - 1, tile_sizze_37710)) *
        tile_sizze_37710;
    ltid_37712 = squot32(srem32(global_tid_33955, tile_sizze_37710 *
                                tile_sizze_37710), tile_sizze_37710);
    ltid_37713 = srem32(global_tid_33955, tile_sizze_37710 * tile_sizze_37710) -
        squot32(srem32(global_tid_33955, tile_sizze_37710 * tile_sizze_37710),
                tile_sizze_37710) * tile_sizze_37710;
    // Empty guard: the body is empty, so this statement has no effect.
    if (slt32(gtid_33946, sizze_31215) && slt32(gtid_33947, res_31237)) { }
    
    __local char *mem_38064;
    __local char *mem_38068;
    float res_34093;
    
    mem_38064 = (__local char *) mem_38064_backing_0;
    mem_38068 = (__local char *) mem_38068_backing_1;
    
    // x_34096 carries the running sum from one chunk to the next.
    float x_34096 = 0.0F;
    int32_t chunk_sizze_34094;
    int32_t chunk_offset_34095 = 0;
    
    // Every work-item runs this loop, in range or not: the loop condition
    // depends only on kernel arguments, and the barriers inside it are
    // reached outside any per-work-item guard.
    while (slt32(chunk_offset_34095, n_31219)) {
        // The final chunk may be shorter than a full tile.
        if (slt32(n_31219 - chunk_offset_34095, tile_sizze_37710)) {
            chunk_sizze_34094 = n_31219 - chunk_offset_34095;
        } else {
            chunk_sizze_34094 = tile_sizze_37710;
        }
        // Stage a tile of mem_37907 into mem_38064.  The trip count is
        // squot32(2 * t * t - 1, t * t), which is 1 for positive t if squot32
        // is truncating division (helper defined elsewhere).
        for (int32_t comb_iter_38724 = 0; comb_iter_38724 <
             squot32(tile_sizze_37710 * tile_sizze_37710 +
                     tiled_group_sizze_37711 - 1, tiled_group_sizze_37711);
             comb_iter_38724++) {
            int32_t cid_37725;
            int32_t cid_37726;
            int32_t flat_comb_id_38725 = comb_iter_38724 *
                    tiled_group_sizze_37711 + local_tid_33956;
            
            // Split the flat id into a (row, column) position in the tile.
            cid_37725 = squot32(flat_comb_id_38725, tile_sizze_37710);
            cid_37726 = flat_comb_id_38725 - squot32(flat_comb_id_38725,
                                                     tile_sizze_37710) *
                tile_sizze_37710;
            // Only rows inside the current chunk and columns whose
            // gtid_33947 is in range are loaded; other tile slots keep
            // whatever they held before.
            if ((slt32(cid_37725, chunk_sizze_34094) && slt32(cid_37726,
                                                              tile_sizze_37710)) &&
                slt32(gtid_33947, res_31237)) {
                // Source element index:
                // (chunk_offset_34095 + ltid_37712) * res_31237 + gtid_33947.
                float x_chunk_outer_elem_37724 = *(__global
                                                   float *) &mem_37907[(res_31237 *
                                                                        0 +
                                                                        gtid_33947 +
                                                                        res_31237 *
                                                                        chunk_offset_34095 +
                                                                        ltid_37712 *
                                                                        res_31237) *
                                                                       4];
                
                *(__local float *) &mem_38064[(cid_37725 * tile_sizze_37710 +
                                               cid_37726) * 4] =
                    x_chunk_outer_elem_37724;
            }
        }
        // Work-group barrier with a local-memory fence after the mem_38064
        // stores.
        barrier(CLK_LOCAL_MEM_FENCE);
        // Empty guard: no effect.
        if (slt32(gtid_33946, sizze_31215) && slt32(gtid_33947, res_31237)) { }
        // Stage a tile of images_mem_37894 into mem_38068 (same trip count as
        // the loop above).
        for (int32_t comb_iter_38726 = 0; comb_iter_38726 <
             squot32(tile_sizze_37710 * tile_sizze_37710 +
                     tiled_group_sizze_37711 - 1, tiled_group_sizze_37711);
             comb_iter_38726++) {
            int32_t cid_37730;
            int32_t cid_37731;
            int32_t flat_comb_id_38727 = comb_iter_38726 *
                    tiled_group_sizze_37711 + local_tid_33956;
            
            cid_37730 = squot32(flat_comb_id_38727, tile_sizze_37710);
            cid_37731 = flat_comb_id_38727 - squot32(flat_comb_id_38727,
                                                     tile_sizze_37710) *
                tile_sizze_37710;
            // Here the chunk bound applies to the column index and the range
            // check is on gtid_33946.
            if ((slt32(cid_37730, tile_sizze_37710) && slt32(cid_37731,
                                                             chunk_sizze_34094)) &&
                slt32(gtid_33946, sizze_31215)) {
                // Source element index:
                // gtid_33946 * sizze_31216 + chunk_offset_34095 + ltid_37713.
                float x_chunk_outer_elem_37729 = *(__global
                                                   float *) &images_mem_37894[(gtid_33946 *
                                                                               sizze_31216 +
                                                                               chunk_offset_34095 +
                                                                               ltid_37713) *
                                                                              4];
                
                *(__local float *) &mem_38068[(cid_37730 * tile_sizze_37710 +
                                               cid_37731) * 4] =
                    x_chunk_outer_elem_37729;
            }
        }
        // Both tiles are complete for the whole work-group after this
        // barrier.
        barrier(CLK_LOCAL_MEM_FENCE);
        // Empty guard: no effect.
        if (slt32(gtid_33946, sizze_31215) && slt32(gtid_33947, res_31237)) { }
        
        float res_34099;
        float sync_37733;
        float acc_34102 = x_34096;
        // Assigned below but never read in this kernel.
        int32_t groupstream_mapaccum_dummy_chunk_sizze_34100;
        
        groupstream_mapaccum_dummy_chunk_sizze_34100 = 1;
        // Only in-range work-items accumulate; the others pass acc_34102
        // through unchanged.
        if (slt32(gtid_33946, sizze_31215) && slt32(gtid_33947, res_31237)) {
            // The two branches have identical loop bodies and differ only in
            // the loop bound (full tile versus partial last chunk).
            if (chunk_sizze_34094 == tile_sizze_37710) {
                for (int32_t i_34101 = 0; i_34101 < tile_sizze_37710;
                     i_34101++) {
                    float x_34105;
                    float x_34106;
                    bool res_34108;
                    float res_34109;
                    float res_34112;
                    
                    // mem_38064 is read at (row i_34101, column ltid_37713),
                    // mem_38068 at (row ltid_37712, column i_34101).
                    x_34105 = *(__local float *) &mem_38064[(tile_sizze_37710 *
                                                             0 + ltid_37713 +
                                                             tile_sizze_37710 *
                                                             i_34101 + 0 *
                                                             tile_sizze_37710) *
                                                            4];
                    x_34106 = *(__local float *) &mem_38068[(ltid_37712 *
                                                             tile_sizze_37710 +
                                                             i_34101) * 4];
                    // A true result from futrts_isnan32 on the
                    // images_mem_37894 element makes this step add 0.
                    res_34108 = futrts_isnan32(x_34106);
                    if (res_34108) {
                        res_34109 = 0.0F;
                    } else {
                        float res_34110 = x_34105 * x_34106;
                        
                        res_34109 = res_34110;
                    }
                    res_34112 = acc_34102 + res_34109;
                    
                    float acc_tmp_38728 = res_34112;
                    
                    acc_34102 = acc_tmp_38728;
                }
            } else {
                for (int32_t i_34101 = 0; i_34101 < chunk_sizze_34094;
                     i_34101++) {
                    float x_34105;
                    float x_34106;
                    bool res_34108;
                    float res_34109;
                    float res_34112;
                    
                    x_34105 = *(__local float *) &mem_38064[(tile_sizze_37710 *
                                                             0 + ltid_37713 +
                                                             tile_sizze_37710 *
                                                             i_34101 + 0 *
                                                             tile_sizze_37710) *
                                                            4];
                    x_34106 = *(__local float *) &mem_38068[(ltid_37712 *
                                                             tile_sizze_37710 +
                                                             i_34101) * 4];
                    res_34108 = futrts_isnan32(x_34106);
                    if (res_34108) {
                        res_34109 = 0.0F;
                    } else {
                        float res_34110 = x_34105 * x_34106;
                        
                        res_34109 = res_34110;
                    }
                    res_34112 = acc_34102 + res_34109;
                    
                    float acc_tmp_38729 = res_34112;
                    
                    acc_34102 = acc_tmp_38729;
                }
            }
        }
        res_34099 = acc_34102;
        sync_37733 = res_34099;
        // This barrier separates the tile reads above from the tile stores of
        // the next iteration.
        barrier(CLK_LOCAL_MEM_FENCE);
        x_34096 = sync_37733;
        chunk_offset_34095 += tile_sizze_37710;
    }
    res_34093 = x_34096;
    // Only in-range work-items write their result.
    if (slt32(gtid_33946, sizze_31215) && slt32(gtid_33947, res_31237)) {
        *(__global float *) &mem_38072[(gtid_33946 * res_31237 + gtid_33947) *
                                       4] = res_34093;
    }
}
__kernel void map_34197(int32_t sizze_31215, int32_t res_31237,
                        int32_t j_m_i_31370, __global unsigned char *mem_38090,
                        __global unsigned char *mem_38095, __global
                        unsigned char *mem_38098, __global
                        unsigned char *mem_38102)
{
    // One work-item per index tid in [0, sizze_31215).  For every c in
    // [0, res_31237) it accumulates, over k in [0, j_m_i_31370), the product
    //     mem_38090[k * sizze_31215 + tid] *
    //     mem_38095[k * (sizze_31215 * res_31237) + c * sizze_31215 + tid]
    // (element indices into 4-byte floats), parks the sum in a per-group slot
    // of mem_38098, and in a second pass copies each parked value to
    // mem_38102[tid + c * sizze_31215].
    const int32_t tid = get_global_id(0);
    const int32_t lane = get_local_id(0);
    const int32_t grp = get_group_id(0);
    const int32_t grp_size = mainzigroup_sizze_34191;

    // Work-items beyond the index space do nothing; the kernel contains no
    // barrier, so leaving early is safe.
    if (!slt32(tid, sizze_31215)) {
        return;
    }

    // Pass 1: compute every sum and park it in the scratch buffer.
    int32_t c = 0;

    while (c < res_31237) {
        float sum = 0.0F;

        for (int32_t k = 0; k < j_m_i_31370; k++) {
            const float lhs = *(__global float *) &mem_38090[(k * sizze_31215 +
                                                              tid) * 4];
            const float rhs = *(__global float *) &mem_38095[(k *
                                                              (sizze_31215 *
                                                               res_31237) +
                                                              c * sizze_31215 +
                                                              tid) * 4];
            // The multiply and the add stay separate statements, as in the
            // generated code.
            const float prod = lhs * rhs;

            sum = sum + prod;
        }
        *(__global float *) &mem_38098[(grp * (grp_size * res_31237) + lane +
                                        c * grp_size) * 4] = sum;
        c++;
    }

    // Pass 2: read the parked sums back and write them to their final place.
    for (int32_t j = 0; j < res_31237; j++) {
        const float parked = *(__global float *) &mem_38098[(grp *
                                                             (grp_size *
                                                              res_31237) +
                                                             lane + j *
                                                             grp_size) * 4];

        *(__global float *) &mem_38102[(tid + j * sizze_31215) * 4] = parked;
    }
}
__kernel void map_34292(int32_t sizze_31215, int32_t res_31237,
                        int32_t j_m_i_31370, __global
                        unsigned char *res_mem_38086, __global
                        unsigned char *mem_38124, __global
                        unsigned char *mem_38128)
{
    // One work-item per output element.  The flat global id is split into
    // row = squot32(id, res_31237) and col = id - row * res_31237.  In-range
    // work-items accumulate j_m_i_31370 products of one res_mem_38086 float
    // and one mem_38124 float, then store the sum at
    // mem_38128[row * res_31237 + col] (element indices into 4-byte floats).
    const int32_t tid = get_global_id(0);
    const int32_t row = squot32(tid, res_31237);
    const int32_t col = tid - row * res_31237;

    // Out-of-range work-items write nothing; there is no barrier in this
    // kernel, so they can leave immediately.
    if (!(slt32(row, sizze_31215) && slt32(col, res_31237))) {
        return;
    }

    // Flat position of this row's first res_mem_38086 operand.
    const int32_t flat_base = j_m_i_31370 * row;
    float total = 0.0F;
    int32_t k = 0;

    while (k < j_m_i_31370) {
        // The flat position is split by res_31237 and recombined, exactly as
        // the generated code does.
        const int32_t flat = k + flat_base;
        const int32_t outer = squot32(flat, res_31237);
        const int32_t inner = flat - res_31237 * outer;
        const float lhs = *(__global float *) &res_mem_38086[(outer *
                                                              res_31237 +
                                                              inner) * 4];
        const float rhs = *(__global float *) &mem_38124[(k * (res_31237 *
                                                               sizze_31215) +
                                                          row * res_31237 +
                                                          col) * 4];
        // The multiply and the add stay separate statements, as in the
        // generated code.
        const float prod = lhs * rhs;

        total = total + prod;
        k++;
    }
    *(__global float *) &mem_38128[(row * res_31237 + col) * 4] = total;
}
__kernel void map_34522(int32_t sizze_31214, int32_t sizze_31215,
                        int32_t res_31237, __global unsigned char *mem_37911,
                        __global unsigned char *mem_38146, __global
                        unsigned char *mem_38149, __global
                        unsigned char *mem_38153)
{
    // One work-item per index tid in [0, sizze_31215).  For every r in
    // [0, sizze_31214) it accumulates, over k in [0, res_31237), the product
    //     mem_38146[k * sizze_31215 + tid] * mem_37911[r * res_31237 + k]
    // (element indices into 4-byte floats), parks the sum in a per-group slot
    // of mem_38149, and in a second pass copies each parked value to
    // mem_38153[tid + r * sizze_31215].
    const int32_t tid = get_global_id(0);
    const int32_t lane = get_local_id(0);
    const int32_t grp = get_group_id(0);
    const int32_t grp_size = mainzigroup_sizze_34516;

    // Work-items beyond the index space do nothing; the kernel contains no
    // barrier, so leaving early is safe.
    if (!slt32(tid, sizze_31215)) {
        return;
    }

    // Pass 1: compute every sum and park it in the scratch buffer.
    int32_t r = 0;

    while (r < sizze_31214) {
        float sum = 0.0F;

        for (int32_t k = 0; k < res_31237; k++) {
            const float lhs = *(__global float *) &mem_38146[(k * sizze_31215 +
                                                              tid) * 4];
            const float rhs = *(__global float *) &mem_37911[(r * res_31237 +
                                                              k) * 4];
            // The multiply and the add stay separate statements, as in the
            // generated code.
            const float prod = lhs * rhs;

            sum = sum + prod;
        }
        *(__global float *) &mem_38149[(grp * (grp_size * sizze_31214) + lane +
                                        r * grp_size) * 4] = sum;
        r++;
    }

    // Pass 2: read the parked sums back and write them to their final place.
    for (int32_t j = 0; j < sizze_31214; j++) {
        const float parked = *(__global float *) &mem_38149[(grp *
                                                             (grp_size *
                                                              sizze_31214) +
                                                             lane + j *
                                                             grp_size) * 4];

        *(__global float *) &mem_38153[(tid + j * sizze_31215) * 4] = parked;
    }
}
// map_34610: tiled sum-of-products kernel over 4-byte floats.
//
// For each in-range pair (gtid_34601 < sizze_31215, gtid_34602 < sizze_31214)
// the kernel writes one float to mem_38185 at element index
// gtid_34601 * sizze_31214 + gtid_34602.  The value is accumulated over
// res_31237 steps, handled in chunks of at most tile_sizze_37760 steps.  Each
// step adds the product of one staged res_mem_38142 element and one staged
// mem_38173 element.
//
// Parameters:
//   sizze_31214   - bound on gtid_34602; also the stride, in floats, used for
//                   mem_38173 and mem_38185.
//   sizze_31215   - bound on gtid_34601 (first output index).
//   res_31237     - total number of accumulation steps; also the stride, in
//                   floats, multiplied by gtid_34601 when indexing
//                   res_mem_38142.
//   res_mem_38142 - input buffer, read as floats.
//   mem_38173     - input buffer, read as floats.
//   mem_38185     - output buffer, written as floats.
//
// NOTE(review): tile stores are indexed from get_local_id(0), while the
// global reads feeding them and the later tile reads use indices derived from
// get_global_id(0).  These agree only if a work-group holds exactly
// tile_sizze_37760 * tile_sizze_37760 work-items - presumably guaranteed by
// the host launch code; confirm there.
__kernel void map_34610(int32_t sizze_31214, int32_t sizze_31215,
                        int32_t res_31237, __global
                        unsigned char *res_mem_38142, __global
                        unsigned char *mem_38173, __global
                        unsigned char *mem_38185)
{
    // Tile edge length and the tile area; both come from the same
    // mainzitile_sizze_37759 constant, which is defined outside this kernel.
    const int32_t tile_sizze_37760 = mainzitile_sizze_37759;
    const int32_t tiled_group_sizze_37761 = mainzitile_sizze_37759 *
                  mainzitile_sizze_37759;
    // block_dim0..2 are not referenced anywhere in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    // Backing storage for the two tiles, each sized for tile * tile floats.
    // ALIGNED_LOCAL_MEMORY is a macro defined elsewhere; the results are used
    // below as __local byte arrays.
    ALIGNED_LOCAL_MEMORY(mem_38177_backing_0, 4 *
                         sext_i32_i64(mainzitile_sizze_37759 *
                         mainzitile_sizze_37759));
    ALIGNED_LOCAL_MEMORY(mem_38181_backing_1, 4 *
                         sext_i32_i64(mainzitile_sizze_37759 *
                         mainzitile_sizze_37759));
    
    int32_t global_tid_34610;
    int32_t local_tid_34611;
    int32_t group_sizze_38845;
    int32_t wave_sizze_38844;
    int32_t group_id_34612;
    
    // group_sizze_38845, wave_sizze_38844 and group_id_34612 are assigned
    // here but never read again in this kernel.
    global_tid_34610 = get_global_id(0);
    local_tid_34611 = get_local_id(0);
    group_sizze_38845 = get_local_size(0);
    wave_sizze_38844 = LOCKSTEP_WIDTH;
    group_id_34612 = get_group_id(0);
    
    int32_t gtid_34601;
    int32_t gtid_34602;
    int32_t ltid_37762;
    int32_t ltid_37763;
    
    // With t = tile_sizze_37760, g = squot32(global_tid, t * t) and
    // w = squot32(sizze_31214 + t - 1, t), the four assignments below compute:
    //   ltid_37762 = squot32(srem32(global_tid, t * t), t)
    //   ltid_37763 = srem32(global_tid, t * t) - ltid_37762 * t
    //   gtid_34601 = ltid_37762 + squot32(g, w) * t
    //   gtid_34602 = ltid_37763 + (g - squot32(g, w) * w) * t
    gtid_34601 = squot32(srem32(global_tid_34610, tile_sizze_37760 *
                                tile_sizze_37760), tile_sizze_37760) +
        squot32(squot32(global_tid_34610, tile_sizze_37760 * tile_sizze_37760),
                squot32(sizze_31214 + tile_sizze_37760 - 1, tile_sizze_37760)) *
        tile_sizze_37760;
    gtid_34602 = srem32(global_tid_34610, tile_sizze_37760 * tile_sizze_37760) -
        squot32(srem32(global_tid_34610, tile_sizze_37760 * tile_sizze_37760),
                tile_sizze_37760) * tile_sizze_37760 +
        (squot32(global_tid_34610, tile_sizze_37760 * tile_sizze_37760) -
         squot32(squot32(global_tid_34610, tile_sizze_37760 * tile_sizze_37760),
                 squot32(sizze_31214 + tile_sizze_37760 - 1,
                         tile_sizze_37760)) * squot32(sizze_31214 +
                                                      tile_sizze_37760 - 1,
                                                      tile_sizze_37760)) *
        tile_sizze_37760;
    ltid_37762 = squot32(srem32(global_tid_34610, tile_sizze_37760 *
                                tile_sizze_37760), tile_sizze_37760);
    ltid_37763 = srem32(global_tid_34610, tile_sizze_37760 * tile_sizze_37760) -
        squot32(srem32(global_tid_34610, tile_sizze_37760 * tile_sizze_37760),
                tile_sizze_37760) * tile_sizze_37760;
    // Empty guard: the body is empty, so this statement has no effect.
    if (slt32(gtid_34601, sizze_31215) && slt32(gtid_34602, sizze_31214)) { }
    
    __local char *mem_38177;
    __local char *mem_38181;
    float res_34742;
    
    mem_38177 = (__local char *) mem_38177_backing_0;
    mem_38181 = (__local char *) mem_38181_backing_1;
    
    // x_34745 carries the running sum from one chunk to the next.
    float x_34745 = 0.0F;
    int32_t chunk_sizze_34743;
    int32_t chunk_offset_34744 = 0;
    
    // Every work-item runs this loop, in range or not: the loop condition
    // depends only on kernel arguments, and the barriers inside it are
    // reached outside any per-work-item guard.
    while (slt32(chunk_offset_34744, res_31237)) {
        // The final chunk may be shorter than a full tile.
        if (slt32(res_31237 - chunk_offset_34744, tile_sizze_37760)) {
            chunk_sizze_34743 = res_31237 - chunk_offset_34744;
        } else {
            chunk_sizze_34743 = tile_sizze_37760;
        }
        // Stage a tile of res_mem_38142 into mem_38177.  The trip count is
        // squot32(2 * t * t - 1, t * t), which is 1 for positive t if squot32
        // is truncating division (helper defined elsewhere).
        for (int32_t comb_iter_38846 = 0; comb_iter_38846 <
             squot32(tile_sizze_37760 * tile_sizze_37760 +
                     tiled_group_sizze_37761 - 1, tiled_group_sizze_37761);
             comb_iter_38846++) {
            int32_t cid_37775;
            int32_t cid_37776;
            int32_t flat_comb_id_38847 = comb_iter_38846 *
                    tiled_group_sizze_37761 + local_tid_34611;
            
            // Split the flat id into a (row, column) position in the tile.
            cid_37775 = squot32(flat_comb_id_38847, tile_sizze_37760);
            cid_37776 = flat_comb_id_38847 - squot32(flat_comb_id_38847,
                                                     tile_sizze_37760) *
                tile_sizze_37760;
            // Only columns inside the current chunk and rows whose
            // gtid_34601 is in range are loaded; other tile slots keep
            // whatever they held before.
            if ((slt32(cid_37775, tile_sizze_37760) && slt32(cid_37776,
                                                             chunk_sizze_34743)) &&
                slt32(gtid_34601, sizze_31215)) {
                // Source element index:
                // gtid_34601 * res_31237 + chunk_offset_34744 + ltid_37763.
                float x_chunk_outer_elem_37774 = *(__global
                                                   float *) &res_mem_38142[(gtid_34601 *
                                                                            res_31237 +
                                                                            chunk_offset_34744 +
                                                                            ltid_37763) *
                                                                           4];
                
                *(__local float *) &mem_38177[(cid_37775 * tile_sizze_37760 +
                                               cid_37776) * 4] =
                    x_chunk_outer_elem_37774;
            }
        }
        // Work-group barrier with a local-memory fence after the mem_38177
        // stores.
        barrier(CLK_LOCAL_MEM_FENCE);
        // Empty guard: no effect.
        if (slt32(gtid_34601, sizze_31215) && slt32(gtid_34602,
                                                    sizze_31214)) { }
        // Stage a tile of mem_38173 into mem_38181 (same trip count as the
        // loop above).
        for (int32_t comb_iter_38848 = 0; comb_iter_38848 <
             squot32(tile_sizze_37760 * tile_sizze_37760 +
                     tiled_group_sizze_37761 - 1, tiled_group_sizze_37761);
             comb_iter_38848++) {
            int32_t cid_37780;
            int32_t cid_37781;
            int32_t flat_comb_id_38849 = comb_iter_38848 *
                    tiled_group_sizze_37761 + local_tid_34611;
            
            cid_37780 = squot32(flat_comb_id_38849, tile_sizze_37760);
            cid_37781 = flat_comb_id_38849 - squot32(flat_comb_id_38849,
                                                     tile_sizze_37760) *
                tile_sizze_37760;
            // Here the chunk bound applies to the row index and the range
            // check is on gtid_34602.
            if ((slt32(cid_37780, chunk_sizze_34743) && slt32(cid_37781,
                                                              tile_sizze_37760)) &&
                slt32(gtid_34602, sizze_31214)) {
                // Source element index:
                // (chunk_offset_34744 + ltid_37762) * sizze_31214 + gtid_34602.
                float x_chunk_outer_elem_37779 = *(__global
                                                   float *) &mem_38173[(gtid_34602 +
                                                                        sizze_31214 *
                                                                        chunk_offset_34744 +
                                                                        ltid_37762 *
                                                                        sizze_31214) *
                                                                       4];
                
                *(__local float *) &mem_38181[(cid_37780 * tile_sizze_37760 +
                                               cid_37781) * 4] =
                    x_chunk_outer_elem_37779;
            }
        }
        // Both tiles are complete for the whole work-group after this
        // barrier.
        barrier(CLK_LOCAL_MEM_FENCE);
        // Empty guard: no effect.
        if (slt32(gtid_34601, sizze_31215) && slt32(gtid_34602,
                                                    sizze_31214)) { }
        
        float res_34748;
        float sync_37783;
        float acc_34751 = x_34745;
        // Assigned below but never read in this kernel.
        int32_t groupstream_mapaccum_dummy_chunk_sizze_34749;
        
        groupstream_mapaccum_dummy_chunk_sizze_34749 = 1;
        // Only in-range work-items accumulate; the others pass acc_34751
        // through unchanged.
        if (slt32(gtid_34601, sizze_31215) && slt32(gtid_34602, sizze_31214)) {
            // The two branches have identical loop bodies and differ only in
            // the loop bound (full tile versus partial last chunk).
            if (chunk_sizze_34743 == tile_sizze_37760) {
                for (int32_t i_34750 = 0; i_34750 < tile_sizze_37760;
                     i_34750++) {
                    float x_34754;
                    float x_34755;
                    float res_34757;
                    float res_34759;
                    
                    // mem_38177 is read at (row ltid_37762, column i_34750),
                    // mem_38181 at (row i_34750, column ltid_37763).
                    x_34754 = *(__local float *) &mem_38177[(ltid_37762 *
                                                             tile_sizze_37760 +
                                                             i_34750) * 4];
                    x_34755 = *(__local float *) &mem_38181[(tile_sizze_37760 *
                                                             0 + ltid_37763 +
                                                             tile_sizze_37760 *
                                                             i_34750 + 0 *
                                                             tile_sizze_37760) *
                                                            4];
                    res_34757 = x_34754 * x_34755;
                    res_34759 = acc_34751 + res_34757;
                    
                    float acc_tmp_38850 = res_34759;
                    
                    acc_34751 = acc_tmp_38850;
                }
            } else {
                for (int32_t i_34750 = 0; i_34750 < chunk_sizze_34743;
                     i_34750++) {
                    float x_34754;
                    float x_34755;
                    float res_34757;
                    float res_34759;
                    
                    x_34754 = *(__local float *) &mem_38177[(ltid_37762 *
                                                             tile_sizze_37760 +
                                                             i_34750) * 4];
                    x_34755 = *(__local float *) &mem_38181[(tile_sizze_37760 *
                                                             0 + ltid_37763 +
                                                             tile_sizze_37760 *
                                                             i_34750 + 0 *
                                                             tile_sizze_37760) *
                                                            4];
                    res_34757 = x_34754 * x_34755;
                    res_34759 = acc_34751 + res_34757;
                    
                    float acc_tmp_38851 = res_34759;
                    
                    acc_34751 = acc_tmp_38851;
                }
            }
        }
        res_34748 = acc_34751;
        sync_37783 = res_34748;
        // This barrier separates the tile reads above from the tile stores of
        // the next iteration.
        barrier(CLK_LOCAL_MEM_FENCE);
        x_34745 = sync_37783;
        chunk_offset_34744 += tile_sizze_37760;
    }
    res_34742 = x_34745;
    // Only in-range work-items write their result.
    if (slt32(gtid_34601, sizze_31215) && slt32(gtid_34602, sizze_31214)) {
        *(__global float *) &mem_38185[(gtid_34601 * sizze_31214 + gtid_34602) *
                                       4] = res_34742;
    }
}
__kernel void map_34858(int32_t sizze_31214, int32_t sizze_31215,
                        int32_t i_31490, __global unsigned char *mem_38203,
                        __global unsigned char *mem_38207, __global
                        unsigned char *mem_38210, __global
                        unsigned char *mem_38213, __global
                        unsigned char *mem_38216, __global
                        unsigned char *mem_38219, __global
                        unsigned char *mem_38225, __global
                        unsigned char *mem_38229, __global
                        unsigned char *mem_38233)
{
    const int32_t group_sizze_34916 = mainzigroup_sizze_34852;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    int32_t global_tid_34858;
    int32_t local_tid_34859;
    int32_t group_sizze_38895;
    int32_t wave_sizze_38894;
    int32_t group_id_34860;
    
    global_tid_34858 = get_global_id(0);
    local_tid_34859 = get_local_id(0);
    group_sizze_38895 = get_local_size(0);
    wave_sizze_38894 = LOCKSTEP_WIDTH;
    group_id_34860 = get_group_id(0);
    
    int32_t gtid_34851;
    
    gtid_34851 = global_tid_34858;
    
    int32_t discard_34928;
    int32_t res_34949;
    
    if (slt32(gtid_34851, sizze_31215)) {
        int32_t scanacc_34931 = 0;
        
        for (int32_t i_34934 = 0; i_34934 < sizze_31214; i_34934++) {
            float x_34935;
            float x_34936;
            bool res_34937;
            bool cond_34938;
            float res_34939;
            bool res_34941;
            bool res_34942;
            int32_t res_34943;
            int32_t res_34946;
            
            x_34935 = *(__global float *) &mem_38203[(i_34934 * sizze_31215 +
                                                      gtid_34851) * 4];
            x_34936 = *(__global float *) &mem_38207[(i_34934 * sizze_31215 +
                                                      gtid_34851) * 4];
            res_34937 = futrts_isnan32(x_34935);
            cond_34938 = !res_34937;
            if (cond_34938) {
                float res_34940 = x_34935 - x_34936;
                
                res_34939 = res_34940;
            } else {
                res_34939 = NAN;
            }
            res_34941 = futrts_isnan32(res_34939);
            res_34942 = !res_34941;
            if (res_34942) {
                res_34943 = 1;
            } else {
                res_34943 = 0;
            }
            res_34946 = scanacc_34931 + res_34943;
            *(__global int32_t *) &mem_38210[(group_id_34860 *
                                              (group_sizze_34916 *
                                               sizze_31214) + local_tid_34859 +
                                              i_34934 * group_sizze_34916) *
                                             4] = res_34946;
            *(__global float *) &mem_38213[(group_id_34860 *
                                            (group_sizze_34916 * sizze_31214) +
                                            local_tid_34859 + i_34934 *
                                            group_sizze_34916) * 4] = res_34939;
            
            int32_t scanacc_tmp_38896 = res_34946;
            
            scanacc_34931 = scanacc_tmp_38896;
        }
        discard_34928 = scanacc_34931;
        res_34949 = *(__global int32_t *) &mem_38210[(group_id_34860 *
                                                      (group_sizze_34916 *
                                                       sizze_31214) +
                                                      local_tid_34859 +
                                                      i_31490 *
                                                      group_sizze_34916) * 4];
        for (int32_t i_38899 = 0; i_38899 < sizze_31214; i_38899++) {
            *(__global float *) &mem_38216[(group_id_34860 *
                                            (group_sizze_34916 * sizze_31214) +
                                            local_tid_34859 + i_38899 *
                                            group_sizze_34916) * 4] = NAN;
        }
        for (int32_t i_38900 = 0; i_38900 < sizze_31214; i_38900++) {
            *(__global int32_t *) &mem_38219[(group_id_34860 *
                                              (group_sizze_34916 *
                                               sizze_31214) + local_tid_34859 +
                                              i_38900 * group_sizze_34916) *
                                             4] = 0;
        }
    }
    
    __private char *mem_38222;
    __private char mem_38222_backing_0[4];
    
    mem_38222 = mem_38222_backing_0;
    if (slt32(gtid_34851, sizze_31215)) {
        for (int32_t write_iter_34956 = 0; write_iter_34956 < sizze_31214;
             write_iter_34956++) {
            float write_iv_34957;
            int32_t write_iv_34958;
            bool res_34963;
            bool res_34964;
            int32_t res_34965;
            bool less_than_zzero_34967;
            bool greater_than_sizze_34968;
            bool outside_bounds_dim_34969;
            
            write_iv_34957 = *(__global float *) &mem_38213[(group_id_34860 *
                                                             (group_sizze_34916 *
                                                              sizze_31214) +
                                                             local_tid_34859 +
                                                             write_iter_34956 *
                                                             group_sizze_34916) *
                                                            4];
            write_iv_34958 = *(__global int32_t *) &mem_38210[(group_id_34860 *
                                                               (group_sizze_34916 *
                                                                sizze_31214) +
                                                               local_tid_34859 +
                                                               write_iter_34956 *
                                                               group_sizze_34916) *
                                                              4];
            res_34963 = futrts_isnan32(write_iv_34957);
            res_34964 = !res_34963;
            if (res_34964) {
                int32_t res_34966 = write_iv_34958 - 1;
                
                res_34965 = res_34966;
            } else {
                res_34965 = -1;
            }
            less_than_zzero_34967 = slt32(res_34965, 0);
            greater_than_sizze_34968 = sle32(sizze_31214, res_34965);
            outside_bounds_dim_34969 = less_than_zzero_34967 ||
                greater_than_sizze_34968;
            if (!outside_bounds_dim_34969) {
                int32_t x_38904;
                
                for (int32_t i_38903 = 0; i_38903 < 1; i_38903++) {
                    x_38904 = write_iter_34956 + sext_i32_i32(i_38903);
                    *(__private int32_t *) &mem_38222[i_38903 * 4] = x_38904;
                }
                for (int32_t i_38905 = 0; i_38905 < 1; i_38905++) {
                    *(__global int32_t *) &mem_38219[(group_id_34860 *
                                                      (group_sizze_34916 *
                                                       sizze_31214) +
                                                      local_tid_34859 +
                                                      group_sizze_34916 *
                                                      res_34965 + i_38905 *
                                                      group_sizze_34916) * 4] =
                        *(__private int32_t *) &mem_38222[i_38905 * 4];
                }
            }
            if (!outside_bounds_dim_34969) {
                for (int32_t i_38906 = 0; i_38906 < 1; i_38906++) {
                    *(__global float *) &mem_38216[(group_id_34860 *
                                                    (group_sizze_34916 *
                                                     sizze_31214) +
                                                    local_tid_34859 +
                                                    group_sizze_34916 *
                                                    res_34965 + i_38906 *
                                                    group_sizze_34916) * 4] =
                        *(__global float *) &mem_38213[(group_id_34860 *
                                                        (group_sizze_34916 *
                                                         sizze_31214) +
                                                        local_tid_34859 +
                                                        group_sizze_34916 *
                                                        write_iter_34956 +
                                                        i_38906 *
                                                        group_sizze_34916) * 4];
                }
            }
        }
    }
    if (slt32(gtid_34851, sizze_31215)) {
        *(__global int32_t *) &mem_38225[gtid_34851 * 4] = res_34949;
    }
    if (slt32(gtid_34851, sizze_31215)) {
        for (int32_t i_38907 = 0; i_38907 < sizze_31214; i_38907++) {
            *(__global float *) &mem_38229[(gtid_34851 + i_38907 *
                                            sizze_31215) * 4] = *(__global
                                                                  float *) &mem_38216[(group_id_34860 *
                                                                                       (group_sizze_34916 *
                                                                                        sizze_31214) +
                                                                                       local_tid_34859 +
                                                                                       i_38907 *
                                                                                       group_sizze_34916) *
                                                                                      4];
        }
    }
    if (slt32(gtid_34851, sizze_31215)) {
        for (int32_t i_38908 = 0; i_38908 < sizze_31214; i_38908++) {
            *(__global int32_t *) &mem_38233[(gtid_34851 + i_38908 *
                                              sizze_31215) * 4] = *(__global
                                                                    int32_t *) &mem_38219[(group_id_34860 *
                                                                                           (group_sizze_34916 *
                                                                                            sizze_31214) +
                                                                                           local_tid_34859 +
                                                                                           i_38908 *
                                                                                           group_sizze_34916) *
                                                                                          4];
        }
    }
}
// Kernel map_35033: one work-item per element of a grid with sizze_31215
// rows and sizze_31214 columns; the flat global id is split by sizze_31214.
// For an element whose float in mem_38274 is not NaN, the target column is
// (mem_38270[row, col] - 1) within the same row: the source column index is
// stored into mem_38285 and the float value into mem_38281 at that target.
// NaN elements, and targets outside [0, sizze_31214), store nothing.
// All four buffers are byte pointers indexed as row-major 4-byte elements.
__kernel void map_35033(int32_t sizze_31214, int32_t sizze_31215, __global
                        unsigned char *mem_38270, __global
                        unsigned char *mem_38274, __global
                        unsigned char *mem_38281, __global
                        unsigned char *mem_38285)
{
    const int32_t flat_id = get_global_id(0);
    const int32_t row = squot32(flat_id, sizze_31214);
    const int32_t col = flat_id - squot32(flat_id, sizze_31214) * sizze_31214;

    // Work-items that fall outside the grid have nothing to read or write.
    if (!(slt32(row, sizze_31215) && slt32(col, sizze_31214)))
        return;

    const int32_t src = row * sizze_31214 + col;
    const float value = *(__global float *) &mem_38274[src * 4];
    const int32_t slot_plus_one = *(__global int32_t *) &mem_38270[src * 4];

    // NaN inputs get the sentinel -1, which the range check below rejects.
    const int32_t slot = futrts_isnan32(value) ? -1 : slot_plus_one - 1;

    // The upper bound on row was already established by the guard above.
    const bool row_ok = sle32(0, row);
    const bool slot_ok = sle32(0, slot) && slt32(slot, sizze_31214);

    if (row_ok && slot_ok) {
        const int32_t dst = row * sizze_31214 + slot;

        *(__global int32_t *) &mem_38285[dst * 4] = col;
        *(__global float *) &mem_38281[dst * 4] = value;
    }
}
// Kernel map_35287: one work-item per column index below sizze_31215.
// For its column the work-item
//   1. counts the non-NaN floats among the first n_31219 rows of mem_38295,
//   2. sums the squares of the first (count) rows of mem_38299,
//   3. derives sqrt(sum / (count - res_31235)) and the integer conversion
//      of hfrac_31221 * count.
// Outputs, one 4-byte element per column:
//   mem_38302 <- converted hfrac_31221 * count (int32)
//   mem_38305 <- count (int32)
//   mem_38308 <- square root term (float)
// mem_38295 and mem_38299 are indexed as (row * sizze_31215 + column).
__kernel void map_35287(int32_t sizze_31215, int32_t n_31219, float hfrac_31221,
                        int32_t res_31235, __global unsigned char *mem_38295,
                        __global unsigned char *mem_38299, __global
                        unsigned char *mem_38302, __global
                        unsigned char *mem_38305, __global
                        unsigned char *mem_38308)
{
    const int32_t col = get_global_id(0);

    // Surplus work-items past the last column do nothing.
    if (!slt32(col, sizze_31215))
        return;

    // Pass 1: number of non-NaN samples in this column.
    int32_t valid = 0;

    for (int32_t row = 0; row < n_31219; row++) {
        const float sample =
            *(__global float *) &mem_38295[(row * sizze_31215 + col) * 4];

        valid = valid + (futrts_isnan32(sample) ? 0 : 1);
    }

    // Pass 2: sum of squares.  Rows at or beyond the count contribute a
    // zero term instead of being read from memory.
    float sum_sq = 0.0F;

    for (int32_t row = 0; row < n_31219; row++) {
        float term;

        if (slt32(row, valid)) {
            term = *(__global float *) &mem_38299[(row * sizze_31215 + col) *
                                                  4];
        } else {
            term = 0.0F;
        }

        // Multiply and add stay separate statements, as in the original.
        const float squared = term * term;

        sum_sq = sum_sq + squared;
    }

    const int32_t reduced_count = valid - res_31235;
    const float reduced_count_f = sitofp_i32_f32(reduced_count);
    const float mean_sq = sum_sq / reduced_count_f;
    const float root = futrts_sqrt32(mean_sq);
    const float valid_f = sitofp_i32_f32(valid);
    const float scaled = hfrac_31221 * valid_f;
    const int32_t scaled_i = fptosi_f32_i32(scaled);

    *(__global int32_t *) &mem_38302[col * 4] = scaled_i;
    *(__global int32_t *) &mem_38305[col * 4] = valid;
    *(__global float *) &mem_38308[col * 4] = root;
}
// Kernel map_35432: one work-item per index below sizze_31215.
// Reads an int32 count from mem_38326 and a float sum from mem_38329, then
// writes
//   mem_38332 <- integer conversion of hfrac_31221 * count (int32)
//   mem_38335 <- sqrt(sum / (count - res_31235)) (float)
// Every buffer holds one 4-byte element per index.
__kernel void map_35432(int32_t sizze_31215, float hfrac_31221,
                        int32_t res_31235, __global unsigned char *mem_38326,
                        __global unsigned char *mem_38329, __global
                        unsigned char *mem_38332, __global
                        unsigned char *mem_38335)
{
    const int32_t item = get_global_id(0);

    // Surplus work-items past the last index do nothing.
    if (!slt32(item, sizze_31215))
        return;

    const int32_t count = *(__global int32_t *) &mem_38326[item * 4];
    const float sum = *(__global float *) &mem_38329[item * 4];

    const int32_t reduced_count = count - res_31235;
    const float reduced_count_f = sitofp_i32_f32(reduced_count);
    const float quotient = sum / reduced_count_f;
    const float root = futrts_sqrt32(quotient);

    const float count_f = sitofp_i32_f32(count);
    const float scaled = hfrac_31221 * count_f;
    const int32_t scaled_i = fptosi_f32_i32(scaled);

    *(__global int32_t *) &mem_38332[item * 4] = scaled_i;
    *(__global float *) &mem_38335[item * 4] = root;
}
// Kernel map_35624: one work-item per row index below sizze_31215.
// With base = res_mem_38340[row] and width = res_mem_38339[row] (both
// int32), the work-item adds up
//   res_mem_38290[row, 1 + (base + k - width)]
// for every k in [0, res_31594) that is also below width; other values of k
// add a zero term.  The float total is stored in mem_38347[row].
// res_mem_38290 is indexed as (row * sizze_31214 + column), 4-byte floats.
__kernel void map_35624(int32_t sizze_31214, int32_t sizze_31215,
                        int32_t res_31594, __global
                        unsigned char *res_mem_38290, __global
                        unsigned char *res_mem_38339, __global
                        unsigned char *res_mem_38340, __global
                        unsigned char *mem_38347)
{
    const int32_t row = get_global_id(0);

    // Surplus work-items past the last row do nothing.
    if (!slt32(row, sizze_31215))
        return;

    const int32_t base = *(__global int32_t *) &res_mem_38340[row * 4];
    const int32_t width = *(__global int32_t *) &res_mem_38339[row * 4];

    float total = 0.0F;

    for (int32_t k = 0; k < res_31594; k++) {
        float term;

        if (slt32(k, width)) {
            // Same operation order as the original index computation.
            const int32_t shifted = base + k;
            const int32_t relative = shifted - width;
            const int32_t column = 1 + relative;

            term = *(__global float *) &res_mem_38290[(row * sizze_31214 +
                                                       column) * 4];
        } else {
            term = 0.0F;
        }
        total = total + term;
    }

    *(__global float *) &mem_38347[row * 4] = total;
}
// Kernel map_35807: one work-item per index below arg_31616.
// Work-item i reads the int32 at position (x_31629 + i - 1) of
// mappingindices_mem_37893, converts it to float and divides by res_31632.
// If that ratio exceeds 2.7182817F (approximately e) its logarithm is
// taken via futrts_log32, otherwise 1.0F is used.  The float result
// lam_31222 * sqrt(term) is stored in mem_38361[i].
__kernel void map_35807(float lam_31222, int32_t arg_31616, int32_t x_31629,
                        float res_31632, __global
                        unsigned char *mappingindices_mem_37893, __global
                        unsigned char *mem_38361)
{
    const int32_t item = get_global_id(0);

    // Surplus work-items past the last index do nothing.
    if (!slt32(item, arg_31616))
        return;

    const int32_t position = x_31629 + item;
    const int32_t source = position - 1;
    const int32_t raw =
        *(__global int32_t *) &mappingindices_mem_37893[source * 4];
    const float raw_f = sitofp_i32_f32(raw);
    const float ratio = raw_f / res_31632;

    float log_term;

    if (2.7182817F < ratio) {
        log_term = futrts_log32(ratio);
    } else {
        log_term = 1.0F;
    }

    const float root = futrts_sqrt32(log_term);
    const float bound = lam_31222 * root;

    *(__global float *) &mem_38361[item * 4] = bound;
}
__kernel void map_35892(int32_t sizze_31214, int32_t sizze_31215,
                        int32_t n_31219, int32_t arg_31616, __global
                        unsigned char *res_mem_38289, __global
                        unsigned char *res_mem_38290, __global
                        unsigned char *res_mem_38291, __global
                        unsigned char *res_mem_38339, __global
                        unsigned char *res_mem_38340, __global
                        unsigned char *res_mem_38341, __global
                        unsigned char *res_mem_38358, __global
                        unsigned char *mem_38361, __global
                        unsigned char *mem_38364, __global
                        unsigned char *mem_38370, __global
                        unsigned char *mem_38373, __global
                        unsigned char *mem_38376, __global
                        unsigned char *mem_38380, __global
                        unsigned char *mem_38384, __global
                        unsigned char *mem_38387, __global
                        unsigned char *mem_38390)
{
    const int32_t group_sizze_36025 = mainzigroup_sizze_35886;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    ALIGNED_LOCAL_MEMORY(mem_38367_backing_0, 4 *
                         sext_i32_i64(mainzigroup_sizze_35886));
    
    int32_t global_tid_35892;
    int32_t local_tid_35893;
    int32_t group_sizze_39144;
    int32_t wave_sizze_39143;
    int32_t group_id_35894;
    
    global_tid_35892 = get_global_id(0);
    local_tid_35893 = get_local_id(0);
    group_sizze_39144 = get_local_size(0);
    wave_sizze_39143 = LOCKSTEP_WIDTH;
    group_id_35894 = get_group_id(0);
    
    int32_t gtid_35885;
    
    gtid_35885 = global_tid_35892;
    
    int32_t x_36034;
    float x_36035;
    int32_t x_36036;
    float x_36037;
    int32_t copy_p_36040;
    int32_t y_36041;
    float res_36042;
    float res_36043;
    float y_36044;
    
    if (slt32(gtid_35885, sizze_31215)) {
        x_36034 = *(__global int32_t *) &res_mem_38340[gtid_35885 * 4];
        x_36035 = *(__global float *) &res_mem_38341[gtid_35885 * 4];
        x_36036 = *(__global int32_t *) &res_mem_38339[gtid_35885 * 4];
        x_36037 = *(__global float *) &res_mem_38358[gtid_35885 * 4];
        copy_p_36040 = *(__global int32_t *) &res_mem_38289[gtid_35885 * 4];
        y_36041 = copy_p_36040 - x_36034;
        res_36042 = sitofp_i32_f32(x_36034);
        res_36043 = futrts_sqrt32(res_36042);
        y_36044 = x_36035 * res_36043;
    }
    
    __local char *mem_38367;
    float inpacc_36046;
    bool res_36047;
    int32_t res_36048;
    float res_36049;
    
    mem_38367 = (__local char *) mem_38367_backing_0;
    
    float inpacc_36053;
    bool inpacc_36054;
    int32_t inpacc_36055;
    float inpacc_36056;
    
    inpacc_36053 = 0.0F;
    inpacc_36054 = 0;
    inpacc_36055 = -1;
    inpacc_36056 = 0.0F;
    
    int32_t chunk_36051;
    int32_t streamseq_chunk_offset_36052 = 0;
    
    while (slt32(streamseq_chunk_offset_36052, arg_31616)) {
        if (slt32(arg_31616 - streamseq_chunk_offset_36052,
                  group_sizze_36025)) {
            chunk_36051 = arg_31616 - streamseq_chunk_offset_36052;
        } else {
            chunk_36051 = group_sizze_36025;
        }
        for (int32_t comb_iter_39145 = 0; comb_iter_39145 < 1;
             comb_iter_39145++) {
            int32_t cid_37786;
            int32_t flat_comb_id_39146 = comb_iter_39145 * group_sizze_36025 +
                    local_tid_35893;
            
            cid_37786 = flat_comb_id_39146;
            if (slt32(cid_37786, chunk_36051) && 1) {
                float inp_outer_elem_37785 = *(__global
                                               float *) &mem_38361[(streamseq_chunk_offset_36052 +
                                                                    local_tid_35893) *
                                                                   4];
                
                *(__local float *) &mem_38367[cid_37786 * 4] =
                    inp_outer_elem_37785;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        
        float discard_36061;
        int32_t szzm1_36082;
        bool empty_arr_36083;
        float lstel_36084;
        float res_36086;
        bool acc0_36088;
        int32_t acc0_36089;
        float acc0_36090;
        bool res_36124;
        int32_t res_36125;
        float res_36130;
        
        if (slt32(gtid_35885, sizze_31215)) {
            float scanacc_36063 = 0.0F;
            
            for (int32_t i_36065 = 0; i_36065 < chunk_36051; i_36065++) {
                int32_t convop_x_37097;
                bool cond_36067;
                float res_36068;
                float res_36080;
                
                convop_x_37097 = streamseq_chunk_offset_36052 + i_36065;
                cond_36067 = sle32(y_36041, convop_x_37097);
                if (cond_36067) {
                    res_36068 = 0.0F;
                } else {
                    bool cond_36069;
                    float res_36070;
                    
                    cond_36069 = convop_x_37097 == 0;
                    if (cond_36069) {
                        res_36070 = x_36037;
                    } else {
                        int32_t x_36071;
                        int32_t i_36072;
                        float negate_arg_36073;
                        float x_36074;
                        int32_t i_36075;
                        float y_36076;
                        float res_36077;
                        
                        x_36071 = x_36034 - x_36036;
                        i_36072 = x_36071 + convop_x_37097;
                        negate_arg_36073 = *(__global
                                             float *) &res_mem_38290[(gtid_35885 *
                                                                      sizze_31214 +
                                                                      i_36072) *
                                                                     4];
                        x_36074 = 0.0F - negate_arg_36073;
                        i_36075 = x_36034 + convop_x_37097;
                        y_36076 = *(__global
                                    float *) &res_mem_38290[(gtid_35885 *
                                                             sizze_31214 +
                                                             i_36075) * 4];
                        res_36077 = x_36074 + y_36076;
                        res_36070 = res_36077;
                    }
                    res_36068 = res_36070;
                }
                res_36080 = scanacc_36063 + res_36068;
                *(__global float *) &mem_38370[(group_id_35894 *
                                                (group_sizze_36025 *
                                                 chunk_36051) +
                                                local_tid_35893 + i_36065 *
                                                group_sizze_36025) * 4] =
                    res_36080;
                
                float scanacc_tmp_39147 = res_36080;
                
                scanacc_36063 = scanacc_tmp_39147;
            }
            discard_36061 = scanacc_36063;
            szzm1_36082 = chunk_36051 - 1;
            empty_arr_36083 = slt32(szzm1_36082, 0);
            if (empty_arr_36083) {
                lstel_36084 = 0.0F;
            } else {
                float lstel_tmp_36085 = *(__global
                                          float *) &mem_38370[(group_id_35894 *
                                                               (group_sizze_36025 *
                                                                chunk_36051) +
                                                               local_tid_35893 +
                                                               szzm1_36082 *
                                                               group_sizze_36025) *
                                                              4];
                
                lstel_36084 = lstel_tmp_36085;
            }
            res_36086 = inpacc_36053 + lstel_36084;
            
            bool redout_36092;
            int32_t redout_36093;
            float redout_36094;
            
            redout_36092 = 0;
            redout_36093 = -1;
            redout_36094 = 0.0F;
            for (int32_t i_36096 = 0; i_36096 < chunk_36051; i_36096++) {
                float x_36097;
                float x_36098;
                int32_t convop_x_37099;
                int32_t x_36099;
                float res_36100;
                float res_36101;
                bool cond_36102;
                bool res_36103;
                bool res_36104;
                bool x_36105;
                float res_36106;
                bool res_36107;
                bool x_36108;
                float res_36109;
                bool res_36116;
                int32_t res_36117;
                float res_36122;
                
                x_36097 = *(__global float *) &mem_38370[(group_id_35894 *
                                                          (group_sizze_36025 *
                                                           chunk_36051) +
                                                          local_tid_35893 +
                                                          i_36096 *
                                                          group_sizze_36025) *
                                                         4];
                x_36098 = *(__local float *) &mem_38367[i_36096 * 4];
                convop_x_37099 = streamseq_chunk_offset_36052 + i_36096;
                x_36099 = convop_x_37099;
                res_36100 = inpacc_36053 + x_36097;
                res_36101 = res_36100 / y_36044;
                cond_36102 = slt32(convop_x_37099, y_36041);
                res_36103 = futrts_isnan32(res_36101);
                res_36104 = !res_36103;
                x_36105 = cond_36102 && res_36104;
                res_36106 = (float) fabs(res_36101);
                res_36107 = x_36098 < res_36106;
                x_36108 = x_36105 && res_36107;
                if (cond_36102) {
                    res_36109 = res_36101;
                } else {
                    res_36109 = 0.0F;
                }
                if (redout_36092) {
                    res_36116 = redout_36092;
                    res_36117 = redout_36093;
                } else {
                    bool x_36118;
                    bool y_36119;
                    bool res_36120;
                    int32_t res_36121;
                    
                    x_36118 = !x_36108;
                    y_36119 = redout_36092 && x_36118;
                    res_36120 = x_36108 || y_36119;
                    if (x_36108) {
                        res_36121 = x_36099;
                    } else {
                        res_36121 = redout_36093;
                    }
                    res_36116 = res_36120;
                    res_36117 = res_36121;
                }
                res_36122 = redout_36094 + res_36109;
                *(__global float *) &mem_38373[(group_id_35894 *
                                                (group_sizze_36025 *
                                                 chunk_36051) +
                                                local_tid_35893 + i_36096 *
                                                group_sizze_36025) * 4] =
                    res_36101;
                
                bool redout_tmp_39149 = res_36116;
                int32_t redout_tmp_39150 = res_36117;
                float redout_tmp_39151;
                
                redout_tmp_39151 = res_36122;
                redout_36092 = redout_tmp_39149;
                redout_36093 = redout_tmp_39150;
                redout_36094 = redout_tmp_39151;
            }
            acc0_36088 = redout_36092;
            acc0_36089 = redout_36093;
            acc0_36090 = redout_36094;
            if (inpacc_36054) {
                res_36124 = inpacc_36054;
                res_36125 = inpacc_36055;
            } else {
                bool x_36126;
                bool y_36127;
                bool res_36128;
                int32_t res_36129;
                
                x_36126 = !acc0_36088;
                y_36127 = inpacc_36054 && x_36126;
                res_36128 = acc0_36088 || y_36127;
                if (acc0_36088) {
                    res_36129 = acc0_36089;
                } else {
                    res_36129 = inpacc_36055;
                }
                res_36124 = res_36128;
                res_36125 = res_36129;
            }
            res_36130 = inpacc_36056 + acc0_36090;
            for (int32_t i_39153 = 0; i_39153 < chunk_36051; i_39153++) {
                *(__global float *) &mem_38364[(group_id_35894 *
                                                (group_sizze_36025 *
                                                 arg_31616) + local_tid_35893 +
                                                group_sizze_36025 *
                                                streamseq_chunk_offset_36052 +
                                                i_39153 * group_sizze_36025) *
                                               4] = *(__global
                                                      float *) &mem_38373[(group_id_35894 *
                                                                           (group_sizze_36025 *
                                                                            chunk_36051) +
                                                                           local_tid_35893 +
                                                                           i_39153 *
                                                                           group_sizze_36025) *
                                                                          4];
            }
        }
        
        float sync_37787;
        bool sync_37788;
        int32_t sync_37789;
        float sync_37790;
        
        sync_37787 = res_36086;
        sync_37788 = res_36124;
        sync_37789 = res_36125;
        sync_37790 = res_36130;
        barrier(CLK_LOCAL_MEM_FENCE);
        inpacc_36053 = sync_37787;
        inpacc_36054 = sync_37788;
        inpacc_36055 = sync_37789;
        inpacc_36056 = sync_37790;
        streamseq_chunk_offset_36052 += group_sizze_36025;
    }
    inpacc_36046 = inpacc_36053;
    res_36047 = inpacc_36054;
    res_36048 = inpacc_36055;
    res_36049 = inpacc_36056;
    
    bool cond_36132;
    int32_t res_36133;
    bool cond_36139;
    bool res_36140;
    bool x_36141;
    bool y_36142;
    bool cond_36143;
    int32_t res_36144;
    
    if (slt32(gtid_35885, sizze_31215)) {
        cond_36132 = !res_36047;
        if (cond_36132) {
            res_36133 = -1;
        } else {
            bool cond_36134;
            int32_t res_36135;
            
            cond_36134 = slt32(res_36048, y_36041);
            if (cond_36134) {
                int32_t i_36136;
                int32_t x_36137;
                int32_t res_36138;
                
                i_36136 = x_36034 + res_36048;
                x_36137 = *(__global int32_t *) &res_mem_38291[(gtid_35885 *
                                                                sizze_31214 +
                                                                i_36136) * 4];
                res_36138 = x_36137 - n_31219;
                res_36135 = res_36138;
            } else {
                res_36135 = -1;
            }
            res_36133 = res_36135;
        }
        cond_36139 = sle32(x_36034, 5);
        res_36140 = sle32(y_36041, 5);
        x_36141 = !cond_36139;
        y_36142 = res_36140 && x_36141;
        cond_36143 = cond_36139 || y_36142;
        if (cond_36143) {
            res_36144 = -2;
        } else {
            res_36144 = res_36133;
        }
        for (int32_t i_39154 = 0; i_39154 < arg_31616; i_39154++) {
            *(__global float *) &mem_38376[(group_id_35894 *
                                            (group_sizze_36025 * arg_31616) +
                                            local_tid_35893 + i_39154 *
                                            group_sizze_36025) * 4] = NAN;
        }
        for (int32_t write_iter_36148 = 0; write_iter_36148 < arg_31616;
             write_iter_36148++) {
            bool cond_36153;
            int32_t res_36154;
            bool less_than_zzero_36158;
            bool greater_than_sizze_36159;
            bool outside_bounds_dim_36160;
            
            cond_36153 = slt32(write_iter_36148, y_36041);
            if (cond_36153) {
                int32_t i_36155;
                int32_t x_36156;
                int32_t res_36157;
                
                i_36155 = x_36034 + write_iter_36148;
                x_36156 = *(__global int32_t *) &res_mem_38291[(gtid_35885 *
                                                                sizze_31214 +
                                                                i_36155) * 4];
                res_36157 = x_36156 - n_31219;
                res_36154 = res_36157;
            } else {
                res_36154 = -1;
            }
            less_than_zzero_36158 = slt32(res_36154, 0);
            greater_than_sizze_36159 = sle32(arg_31616, res_36154);
            outside_bounds_dim_36160 = less_than_zzero_36158 ||
                greater_than_sizze_36159;
            if (!outside_bounds_dim_36160) {
                for (int32_t i_39156 = 0; i_39156 < 1; i_39156++) {
                    *(__global float *) &mem_38376[(group_id_35894 *
                                                    (group_sizze_36025 *
                                                     arg_31616) +
                                                    local_tid_35893 +
                                                    group_sizze_36025 *
                                                    res_36154 + i_39156 *
                                                    group_sizze_36025) * 4] =
                        *(__global float *) &mem_38364[(group_id_35894 *
                                                        (group_sizze_36025 *
                                                         arg_31616) +
                                                        local_tid_35893 +
                                                        group_sizze_36025 *
                                                        write_iter_36148 +
                                                        i_39156 *
                                                        group_sizze_36025) * 4];
                }
            }
        }
    }
    if (slt32(gtid_35885, sizze_31215)) {
        for (int32_t i_39157 = 0; i_39157 < arg_31616; i_39157++) {
            *(__global float *) &mem_38380[(gtid_35885 + i_39157 *
                                            sizze_31215) * 4] = *(__global
                                                                  float *) &mem_38376[(group_id_35894 *
                                                                                       (group_sizze_36025 *
                                                                                        arg_31616) +
                                                                                       local_tid_35893 +
                                                                                       i_39157 *
                                                                                       group_sizze_36025) *
                                                                                      4];
        }
    }
    if (slt32(gtid_35885, sizze_31215)) {
        for (int32_t i_39158 = 0; i_39158 < arg_31616; i_39158++) {
            *(__global float *) &mem_38384[(gtid_35885 + i_39158 *
                                            sizze_31215) * 4] = *(__global
                                                                  float *) &mem_38364[(group_id_35894 *
                                                                                       (group_sizze_36025 *
                                                                                        arg_31616) +
                                                                                       local_tid_35893 +
                                                                                       i_39158 *
                                                                                       group_sizze_36025) *
                                                                                      4];
        }
    }
    if (slt32(gtid_35885, sizze_31215)) {
        *(__global int32_t *) &mem_38387[gtid_35885 * 4] = res_36144;
    }
    if (slt32(gtid_35885, sizze_31215)) {
        *(__global float *) &mem_38390[gtid_35885 * 4] = res_36049;
    }
}
/*
 * map_36295: one work-item per (row, col) pair, where the flat global id is
 * split by arg_31616 and only pairs with row < sizze_31215 and
 * col < arg_31616 do any work.
 *
 * For its pair the work-item:
 *   - loads a per-row int32 from res_mem_38340 and from mem_38437, and the
 *     float at (row, col) of mem_38453;
 *   - if col is below the mem_38437 value, looks up
 *     res_mem_38291[row * sizze_31214 + col + <res_mem_38340 value>] and
 *     subtracts n_31219 to get a destination column, otherwise uses -1;
 *   - stores the float at (row, destination) of mem_38466 when the
 *     destination lies in [0, arg_31616).
 *
 * All buffers are byte pointers; element offsets are scaled by 4.
 */
__kernel void map_36295(int32_t sizze_31214, int32_t sizze_31215,
                        int32_t n_31219, int32_t arg_31616,
                        __global unsigned char *res_mem_38291,
                        __global unsigned char *res_mem_38340,
                        __global unsigned char *mem_38437,
                        __global unsigned char *mem_38453,
                        __global unsigned char *mem_38466)
{
    const int32_t flat_tid = get_global_id(0);
    const int32_t row = squot32(flat_tid, arg_31616);
    const int32_t col = flat_tid - row * arg_31616;

    // Work-items outside the index space neither read nor write anything.
    if (!(slt32(row, sizze_31215) && slt32(col, arg_31616))) {
        return;
    }

    const int32_t offset = *(__global int32_t *) &res_mem_38340[row * 4];
    const int32_t limit = *(__global int32_t *) &mem_38437[row * 4];
    const float value =
        *(__global float *) &mem_38453[(row * arg_31616 + col) * 4];

    // -1 marks "no destination" and is rejected by the range test below.
    int32_t dest = -1;

    if (slt32(col, limit)) {
        const int32_t src = col + offset;

        dest = *(__global int32_t *) &res_mem_38291[(row * sizze_31214 +
                                                     src) * 4] - n_31219;
    }

    if (sle32(0, row) && sle32(0, dest) && slt32(dest, arg_31616)) {
        *(__global float *) &mem_38466[(row * arg_31616 + dest) * 4] = value;
    }
}
/*
 * map_36344: one work-item per index gtid < sizze_31215.
 *
 * Loads, for its index, an int32 from res_mem_38340 (offset), an int32 from
 * mem_38437 (limit), a bool from mem_38443 (flag) and an int32 from
 * mem_38459 (pos), then writes one int32 to mem_38462[gtid]:
 *   -2  when offset <= 5 or limit <= 5;
 *   otherwise -1 when the flag is false or pos is not below limit;
 *   otherwise res_mem_38291[gtid * sizze_31214 + offset + pos] - n_31219.
 *
 * The res_mem_38291 lookup is performed whenever the flag is set and pos is
 * below limit, even if the final result turns out to be -2.
 */
__kernel void map_36344(int32_t sizze_31214, int32_t sizze_31215,
                        int32_t n_31219,
                        __global unsigned char *res_mem_38291,
                        __global unsigned char *res_mem_38340,
                        __global unsigned char *mem_38437,
                        __global unsigned char *mem_38443,
                        __global unsigned char *mem_38459,
                        __global unsigned char *mem_38462)
{
    const int32_t gtid = get_global_id(0);

    if (!slt32(gtid, sizze_31215)) {
        return;
    }

    const int32_t offset = *(__global int32_t *) &res_mem_38340[gtid * 4];
    const int32_t limit = *(__global int32_t *) &mem_38437[gtid * 4];
    const bool flag = *(__global bool *) &mem_38443[gtid];
    const int32_t pos = *(__global int32_t *) &mem_38459[gtid * 4];

    int32_t looked_up = -1;

    if (flag && slt32(pos, limit)) {
        const int32_t src = offset + pos;

        looked_up = *(__global int32_t *) &res_mem_38291[(gtid * sizze_31214 +
                                                          src) * 4] - n_31219;
    }

    // Either quantity being at most 5 overrides the looked-up value.
    const bool too_small = sle32(offset, 5) || sle32(limit, 5);

    *(__global int32_t *) &mem_38462[gtid * 4] = too_small ? -2 : looked_up;
}
/*
 * map_36377: one work-item per index gtid < sizze_31215.
 *
 * Writes mem_38459[gtid] (int32) as mem_38446[gtid] when the bool stored at
 * mem_38443[gtid] is true, and as -1 otherwise. Both inputs are always
 * loaded for in-range work-items.
 */
__kernel void map_36377(int32_t sizze_31215,
                        __global unsigned char *mem_38443,
                        __global unsigned char *mem_38446,
                        __global unsigned char *mem_38459)
{
    const int32_t gtid = get_global_id(0);

    if (slt32(gtid, sizze_31215)) {
        const bool flag = *(__global bool *) &mem_38443[gtid];
        const int32_t value = *(__global int32_t *) &mem_38446[gtid * 4];

        *(__global int32_t *) &mem_38459[gtid * 4] = flag ? value : -1;
    }
}
/*
 * map_36485: one work-item per index gtid < sizze_31215.
 *
 * With count = res_mem_38340[gtid] (int32), factor = res_mem_38341[gtid]
 * (float) and total = res_mem_38289[gtid] (int32), it writes
 *   mem_38434[gtid] = factor * futrts_sqrt32(sitofp_i32_f32(count))  (float)
 *   mem_38437[gtid] = total - count                                  (int32)
 *
 * All three loads are issued before either store, as in the generated code.
 */
__kernel void map_36485(int32_t sizze_31215,
                        __global unsigned char *res_mem_38289,
                        __global unsigned char *res_mem_38340,
                        __global unsigned char *res_mem_38341,
                        __global unsigned char *mem_38434,
                        __global unsigned char *mem_38437)
{
    const int32_t gtid = get_global_id(0);

    if (!slt32(gtid, sizze_31215)) {
        return;
    }

    const int32_t count = *(__global int32_t *) &res_mem_38340[gtid * 4];
    const float factor = *(__global float *) &res_mem_38341[gtid * 4];
    const int32_t total = *(__global int32_t *) &res_mem_38289[gtid * 4];

    const int32_t remainder = total - count;
    const float scaled = factor * futrts_sqrt32(sitofp_i32_f32(count));

    *(__global float *) &mem_38434[gtid * 4] = scaled;
    *(__global int32_t *) &mem_38437[gtid * 4] = remainder;
}
/*
 * map_intra_group_32493: per work-group, fills a res_31237 x res_31237 float
 * tile in local memory and then copies that tile to mem_37948.
 *
 * Phase 1 (before the barrier): work-items take tile rows
 * row = round * computed_group_sizze_32491 + local id, for as many rounds as
 * are needed to cover res_31237 rows. For each column of its row a work-item
 * accumulates, over k in [0, n_31219),
 *     arg_mem_37903[inner * sizze_31214 + k] * mem_37911[k * res_31237 + col]
 * multiplied by 0.0F when futrts_isnan32 reports true for
 * images_mem_37894[outer * sizze_31216 + k] and by 1.0F otherwise. The sums
 * are first stored in the global scratch buffer mem_37939 and then copied
 * from there into the local tile.
 *
 * Phase 2 (after the barrier): the work-group cooperatively copies the
 * res_31237 * res_31237 tile elements to mem_37948 at the offset
 * group id * res_31237 * res_31237.
 *
 * outer / inner are the quotient and remainder of the global id by
 * computed_group_sizze_32491.
 * NOTE(review): the arg_mem_37903 read is indexed by inner (derived from the
 * global id), not by the per-round row index; this is kept exactly as
 * generated -- confirm the launch guarantees a single round if that matters.
 */
__kernel void map_intra_group_32493(__local volatile
                                    int64_t *mem_37943_backing_aligned_0,
                                    int32_t sizze_31214, int32_t sizze_31215,
                                    int32_t sizze_31216, int32_t n_31219,
                                    int32_t res_31237,
                                    int32_t computed_group_sizze_32491,
                                    __global unsigned char *images_mem_37894,
                                    __global unsigned char *arg_mem_37903,
                                    __global unsigned char *mem_37911,
                                    __global unsigned char *mem_37939,
                                    __global unsigned char *mem_37948)
{
    const int32_t flat_tid = get_global_id(0);
    const int32_t lane = get_local_id(0);
    const int32_t group = get_group_id(0);

    const int32_t outer = squot32(flat_tid, computed_group_sizze_32491);
    const int32_t inner = flat_tid - outer * computed_group_sizze_32491;

    // Local tile, addressed in bytes like every other buffer here.
    __local char *tile = (__local char *) mem_37943_backing_aligned_0;

    // Element offset of this work-item's first slot in the global scratch.
    const int32_t scratch_base =
        group * (computed_group_sizze_32491 * res_31237) + lane;

    const int32_t rounds = squot32(res_31237 + computed_group_sizze_32491 - 1,
                                   computed_group_sizze_32491);

    for (int32_t round = 0; round < rounds; round++) {
        const int32_t row = round * computed_group_sizze_32491 + lane;

        if (!slt32(row, res_31237)) {
            continue;
        }

        // Compute every column of this row into the global scratch buffer.
        for (int32_t col = 0; col < res_31237; col++) {
            float acc = 0.0F;

            for (int32_t k = 0; k < n_31219; k++) {
                const float pixel =
                    *(__global float *) &images_mem_37894[(outer *
                                                           sizze_31216 +
                                                           k) * 4];
                const float lhs =
                    *(__global float *) &arg_mem_37903[(inner * sizze_31214 +
                                                        k) * 4];
                const float rhs =
                    *(__global float *) &mem_37911[(k * res_31237 + col) * 4];
                const float product = lhs * rhs;
                const float keep = futrts_isnan32(pixel) ? 0.0F : 1.0F;

                acc = acc + product * keep;
            }
            *(__global float *) &mem_37939[(scratch_base + col *
                                            computed_group_sizze_32491) * 4] =
                acc;
        }

        // Move the finished row from the scratch buffer into the local tile.
        for (int32_t col = 0; col < res_31237; col++) {
            *(__local float *) &tile[(row * res_31237 + col) * 4] =
                *(__global float *) &mem_37939[(scratch_base + col *
                                                computed_group_sizze_32491) *
                                               4];
        }
    }

    // Every work-item reaches this point; the tile is complete afterwards.
    barrier(CLK_LOCAL_MEM_FENCE);

    const int32_t tile_elems = res_31237 * res_31237;

    for (int32_t step = 0;
         step < squot32(tile_elems - lane + computed_group_sizze_32491 - 1,
                        computed_group_sizze_32491);
         step++) {
        const int32_t flat = step * computed_group_sizze_32491 + lane;
        const int32_t r = squot32(flat, res_31237);
        const int32_t c = flat - r * res_31237;

        *(__global float *) &mem_37948[(group * (res_31237 * res_31237) +
                                        r * res_31237 + c) * 4] =
            *(__local float *) &tile[(r * res_31237 + c) * 4];
    }
}
/*
 * map_intra_group_32641: each work-item computes one float sum, the
 * work-group stages the sums in local memory, and after a barrier copies
 * res_31237 of them to mem_37969 at element offset group id * res_31237.
 *
 * The global id is decomposed as
 *     img = id / (res_31237 * res_31237)
 *     p   = (id mod res_31237^2) / res_31237
 *     q   = (id mod res_31237^2) mod res_31237
 * (all via squot32 and subtraction). For img < sizze_31215, p < res_31237
 * and q < res_31237 the sum runs over k in [0, n_31219) of
 *     mem_37907[k * res_31237 + p] * mem_37911[k * res_31237 + q]
 * multiplied by 0.0F when futrts_isnan32 reports true for
 * images_mem_37894[img * sizze_31216 + k] and by 1.0F otherwise.
 */
__kernel void map_intra_group_32641(__local volatile
                                    int64_t *mem_37965_backing_aligned_0,
                                    int32_t sizze_31215, int32_t sizze_31216,
                                    int32_t n_31219, int32_t res_31237,
                                    __global unsigned char *images_mem_37894,
                                    __global unsigned char *mem_37907,
                                    __global unsigned char *mem_37911,
                                    __global unsigned char *mem_37969)
{
    const int32_t flat_tid = get_global_id(0);
    const int32_t lane = get_local_id(0);
    const int32_t group = get_group_id(0);

    const int32_t plane = res_31237 * res_31237;
    const int32_t img = squot32(flat_tid, plane);
    const int32_t within = flat_tid - img * plane;
    const int32_t p = squot32(within, res_31237);
    const int32_t q = within - p * res_31237;

    // NOTE(review): deliberately left unset when the guard below fails, as
    // in the generated code; the value is still staged afterwards. Confirm
    // the launch configuration never produces such work-items.
    float total;

    if (slt32(img, sizze_31215) && slt32(p, res_31237) &&
        slt32(q, res_31237)) {
        float acc = 0.0F;

        for (int32_t k = 0; k < n_31219; k++) {
            const float pixel =
                *(__global float *) &images_mem_37894[(img * sizze_31216 +
                                                       k) * 4];
            const float lhs =
                *(__global float *) &mem_37907[(k * res_31237 + p) * 4];
            const float rhs =
                *(__global float *) &mem_37911[(k * res_31237 + q) * 4];
            const float product = lhs * rhs;
            const float keep = futrts_isnan32(pixel) ? 0.0F : 1.0F;

            acc = acc + product * keep;
        }
        total = acc;
    }

    // Local staging buffer, addressed in bytes.
    __local char *stage = (__local char *) mem_37965_backing_aligned_0;

    // Single staging round: slot index equals the local id.
    if (slt32(lane, res_31237)) {
        *(__local float *) &stage[lane * 4] = total;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    for (int32_t step = 0;
         step < squot32(res_31237 - lane + res_31237 - 1, res_31237);
         step++) {
        const int32_t slot = step * res_31237 + lane;

        *(__global float *) &mem_37969[(group * res_31237 + slot) * 4] =
            *(__local float *) &stage[slot * 4];
    }
}
// map_intra_group_32738: each work-group adds up n_31219 per-work-item
// products and writes one float per group.
//
// A work-item unflattens its global id into (gtid_32729, gtid_32730,
// gtid_32731, ltid_32734) and, when all four are in range, computes
//   arg_mem_37903[gtid_32730, ltid] * mem_37985[gtid_32731, ltid]
// multiplied by 0 if images_mem_37894[gtid_32729, ltid] is NaN and by 1
// otherwise. The value is stored in local memory at the work-item's local id,
// the stored values are combined by addition, and work-item 0 writes slot 0
// of local memory to mem_37991[group_id].
//
// Parameters:
//   mem_37988_backing_aligned_0  local scratch, accessed as floats indexed by
//                                local id
//   sizze_31214  row stride, in floats, of arg_mem_37903 and mem_37985
//   sizze_31215  upper bound checked against gtid_32729
//   sizze_31216  row stride, in floats, of images_mem_37894
//   n_31219      upper bound for ltid_32734 and number of local slots combined
//   res_31237    upper bound checked against gtid_32730 and gtid_32731
//   images_mem_37894, arg_mem_37903, mem_37985  global inputs, read as float
//   mem_37991    global output, one float per work-group
__kernel void map_intra_group_32738(__local volatile
                                    int64_t *mem_37988_backing_aligned_0,
                                    int32_t sizze_31214, int32_t sizze_31215,
                                    int32_t sizze_31216, int32_t n_31219,
                                    int32_t res_31237, __global
                                    unsigned char *images_mem_37894, __global
                                    unsigned char *arg_mem_37903, __global
                                    unsigned char *mem_37985, __global
                                    unsigned char *mem_37991)
{
    // block_dim0..2 and group_sizze_38646 are not referenced again in this
    // kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    __local volatile char *restrict mem_37988_backing_0 =
                          mem_37988_backing_aligned_0;
    int32_t global_tid_32738;
    int32_t local_tid_32739;
    int32_t group_sizze_38646;
    int32_t wave_sizze_38645;
    int32_t group_id_32740;
    
    global_tid_32738 = get_global_id(0);
    local_tid_32739 = get_local_id(0);
    group_sizze_38646 = get_local_size(0);
    wave_sizze_38645 = LOCKSTEP_WIDTH;
    group_id_32740 = get_group_id(0);
    
    int32_t gtid_32729;
    int32_t gtid_32730;
    int32_t gtid_32731;
    int32_t ltid_32734;
    
    // Unflatten global_tid_32738 by successive quotient and remainder with
    // divisors res_31237 * res_31237 * n_31219, res_31237 * n_31219 and
    // n_31219. ltid_32734 is the final remainder.
    gtid_32729 = squot32(global_tid_32738, res_31237 * res_31237 * n_31219);
    gtid_32730 = squot32(global_tid_32738 - squot32(global_tid_32738,
                                                    res_31237 * res_31237 *
                                                    n_31219) * (res_31237 *
                                                                res_31237 *
                                                                n_31219),
                         res_31237 * n_31219);
    gtid_32731 = squot32(global_tid_32738 - squot32(global_tid_32738,
                                                    res_31237 * res_31237 *
                                                    n_31219) * (res_31237 *
                                                                res_31237 *
                                                                n_31219) -
                         squot32(global_tid_32738 - squot32(global_tid_32738,
                                                            res_31237 *
                                                            res_31237 *
                                                            n_31219) *
                                 (res_31237 * res_31237 * n_31219), res_31237 *
                                 n_31219) * (res_31237 * n_31219), n_31219);
    ltid_32734 = global_tid_32738 - squot32(global_tid_32738, res_31237 *
                                            res_31237 * n_31219) * (res_31237 *
                                                                    res_31237 *
                                                                    n_31219) -
        squot32(global_tid_32738 - squot32(global_tid_32738, res_31237 *
                                           res_31237 * n_31219) * (res_31237 *
                                                                   res_31237 *
                                                                   n_31219),
                res_31237 * n_31219) * (res_31237 * n_31219) -
        squot32(global_tid_32738 - squot32(global_tid_32738, res_31237 *
                                           res_31237 * n_31219) * (res_31237 *
                                                                   res_31237 *
                                                                   n_31219) -
                squot32(global_tid_32738 - squot32(global_tid_32738, res_31237 *
                                                   res_31237 * n_31219) *
                        (res_31237 * res_31237 * n_31219), res_31237 *
                        n_31219) * (res_31237 * n_31219), n_31219) * n_31219;
    
    float x_36817;
    float x_36819;
    float x_36821;
    float x_33090;
    bool res_33091;
    float y_33092;
    float res_33093;
    
    // Per-work-item value; res_33093 is only assigned when every index is in
    // range.
    if (((slt32(gtid_32729, sizze_31215) && slt32(gtid_32730, res_31237)) &&
         slt32(gtid_32731, res_31237)) && slt32(ltid_32734, n_31219)) {
        x_36817 = *(__global float *) &images_mem_37894[(gtid_32729 *
                                                         sizze_31216 +
                                                         ltid_32734) * 4];
        x_36819 = *(__global float *) &arg_mem_37903[(gtid_32730 * sizze_31214 +
                                                      ltid_32734) * 4];
        x_36821 = *(__global float *) &mem_37985[(gtid_32731 * sizze_31214 +
                                                  ltid_32734) * 4];
        x_33090 = x_36819 * x_36821;
        // NaN in the images value turns the product into a multiplication
        // by zero; any other value leaves it unchanged.
        res_33091 = futrts_isnan32(x_36817);
        if (res_33091) {
            y_33092 = 0.0F;
        } else {
            y_33092 = 1.0F;
        }
        res_33093 = x_33090 * y_33092;
    }
    
    __local char *mem_37988;
    float res_33094;
    
    mem_37988 = (__local char *) mem_37988_backing_0;
    // Publish the per-work-item value to local memory. The loop body runs
    // exactly once, so the slot index equals local_tid_32739.
    // NOTE(review): this guard tests the local id while res_33093 was assigned
    // under a guard on the unflattened global id; presumably the launch makes
    // the two agree - confirm.
    for (int32_t comb_iter_38647 = 0; comb_iter_38647 < 1; comb_iter_38647++) {
        int32_t ctid_32736;
        int32_t flat_comb_id_38648 = comb_iter_38647 * n_31219 +
                local_tid_32739;
        
        ctid_32736 = flat_comb_id_38648;
        if (slt32(ctid_32736, n_31219) && 1) {
            *(__local float *) &mem_37988[ctid_32736 * 4] = res_33093;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t offset_38649;
    int32_t skip_waves_38650;
    float x_33095;
    float x_33096;
    
    offset_38649 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_32739, n_31219)) {
            x_33095 = *(__local float *) &mem_37988[(local_tid_32739 +
                                                     offset_38649) * 4];
        }
    }
    offset_38649 = 1;
    // Stage 1: strides 1, 2, 4, ... while the stride is below
    // wave_sizze_38645. A work-item takes part when its partner slot is below
    // n_31219 and its position inside its wave_sizze_38645-sized chunk is a
    // multiple of twice the stride. There is no barrier in this loop; the
    // partner read and the result write go through volatile pointers.
    // NOTE(review): presumably this relies on LOCKSTEP_WIDTH work-items
    // executing in lockstep - confirm for the target device.
    while (slt32(offset_38649, wave_sizze_38645)) {
        if (slt32(local_tid_32739 + offset_38649, n_31219) &&
            ((local_tid_32739 - squot32(local_tid_32739, wave_sizze_38645) *
              wave_sizze_38645) & (2 * offset_38649 - 1)) == 0) {
            // read array element
            {
                x_33096 = *(volatile __local
                            float *) &mem_37988[(local_tid_32739 +
                                                 offset_38649) * 4];
            }
            // apply reduction operation
            {
                float res_33097;
                
                // NOTE(review): res_33097 stays unassigned when this bounds
                // guard is false and is still copied into x_33095 below.
                if (((slt32(gtid_32729, sizze_31215) && slt32(gtid_32730,
                                                              res_31237)) &&
                     slt32(gtid_32731, res_31237)) && slt32(ltid_32734,
                                                            n_31219)) {
                    res_33097 = x_33095 + x_33096;
                }
                x_33095 = res_33097;
            }
            // write result of operation
            {
                *(volatile __local float *) &mem_37988[local_tid_32739 * 4] =
                    x_33095;
            }
        }
        offset_38649 *= 2;
    }
    skip_waves_38650 = 1;
    // Stage 2: combine the per-chunk partial sums. The stride is
    // skip_waves_38650 * wave_sizze_38645 and a barrier precedes every step.
    // Only the first work-item of a chunk whose chunk index is a multiple of
    // twice skip_waves_38650 takes part.
    while (slt32(skip_waves_38650, squot32(n_31219 + wave_sizze_38645 - 1,
                                           wave_sizze_38645))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_38649 = skip_waves_38650 * wave_sizze_38645;
        if (slt32(local_tid_32739 + offset_38649, n_31219) &&
            ((local_tid_32739 - squot32(local_tid_32739, wave_sizze_38645) *
              wave_sizze_38645) == 0 && (squot32(local_tid_32739,
                                                 wave_sizze_38645) & (2 *
                                                                      skip_waves_38650 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_33096 = *(__local float *) &mem_37988[(local_tid_32739 +
                                                         offset_38649) * 4];
            }
            // apply reduction operation
            {
                float res_33097;
                
                // NOTE(review): same conditional assignment as in stage 1.
                if (((slt32(gtid_32729, sizze_31215) && slt32(gtid_32730,
                                                              res_31237)) &&
                     slt32(gtid_32731, res_31237)) && slt32(ltid_32734,
                                                            n_31219)) {
                    res_33097 = x_33095 + x_33096;
                }
                x_33095 = res_33097;
            }
            // write result of operation
            {
                *(__local float *) &mem_37988[local_tid_32739 * 4] = x_33095;
            }
        }
        skip_waves_38650 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Slot 0 holds the combined value; every work-item reads it but only
    // work-item 0 stores it to the per-group output.
    res_33094 = *(__local float *) &mem_37988[0];
    if (local_tid_32739 == 0) {
        *(__global float *) &mem_37991[group_id_32740 * 4] = res_33094;
    }
}
// map_intra_group_33149: each work-group builds a matrix with rows of length
// j_31369 in local memory, transforms it in res_31237 steps, and copies
// j_m_i_31370 columns of each of res_31237 rows (starting at column
// res_31237) to global memory.
//
// Initial contents, for element (row, col) = (ltid / j_31369, ltid mod
// j_31369): columns below res_31237 are read from res_mem_38006[gtid, row,
// col]; the remaining columns hold 1 where col == res_31237 + row and 0
// elsewhere.
// NOTE(review): this looks like an input matrix augmented with an identity
// block, processed by Gauss-Jordan style elimination - confirm against the
// Futhark source that generated this kernel.
//
// Parameters:
//   mem_38009_backing_aligned_0  local scratch holding the working matrix
//   mem_38013_backing_aligned_1  local scratch used to stage each step's new
//                                values before they are copied back
//   sizze_31215  upper bound checked against gtid_33143
//   res_31237    number of steps; also the column count read from
//                res_mem_38006 and the row count written to mem_38019
//   m_31319      rows with an index below this value take the subtracting
//                branch of a step
//   j_31369      row length of the working matrix
//   j_m_i_31370  number of columns per row copied to the output
//   res_31373    upper bound checked against ltid_33144 and against the
//                local id
//   res_mem_38006  global input, read as float
//   mem_38019      global output, written as float
__kernel void map_intra_group_33149(__local volatile
                                    int64_t *mem_38009_backing_aligned_0,
                                    __local volatile
                                    int64_t *mem_38013_backing_aligned_1,
                                    int32_t sizze_31215, int32_t res_31237,
                                    int32_t m_31319, int32_t j_31369,
                                    int32_t j_m_i_31370, int32_t res_31373,
                                    __global unsigned char *res_mem_38006,
                                    __global unsigned char *mem_38019)
{
    // block_dim0..2, group_sizze_38688 and wave_sizze_38687 are not
    // referenced again in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    __local volatile char *restrict mem_38009_backing_0 =
                          mem_38009_backing_aligned_0;
    __local volatile char *restrict mem_38013_backing_1 =
                          mem_38013_backing_aligned_1;
    int32_t global_tid_33149;
    int32_t local_tid_33150;
    int32_t group_sizze_38688;
    int32_t wave_sizze_38687;
    int32_t group_id_33151;
    
    global_tid_33149 = get_global_id(0);
    local_tid_33150 = get_local_id(0);
    group_sizze_38688 = get_local_size(0);
    wave_sizze_38687 = LOCKSTEP_WIDTH;
    group_id_33151 = get_group_id(0);
    
    int32_t gtid_33143;
    int32_t ltid_33144;
    
    // Quotient and remainder of the global id by res_31373.
    gtid_33143 = squot32(global_tid_33149, res_31373);
    ltid_33144 = global_tid_33149 - squot32(global_tid_33149, res_31373) *
        res_31373;
    
    int32_t x_36846;
    int32_t x_36848;
    bool cond_33318;
    float x_36850;
    
    if (slt32(gtid_33143, sizze_31215) && slt32(ltid_33144, res_31373)) {
        // x_36846 is this element's row, x_36848 its column.
        x_36846 = sdiv32(ltid_33144, j_31369);
        x_36848 = smod32(ltid_33144, j_31369);
        cond_33318 = slt32(x_36848, res_31237);
        if (cond_33318) {
            // Column inside the input part: load from the global matrix.
            float res_33320 = *(__global float *) &res_mem_38006[(gtid_33143 *
                                                                  (res_31237 *
                                                                   res_31237) +
                                                                  x_36846 *
                                                                  res_31237 +
                                                                  x_36848) * 4];
            
            x_36850 = res_33320;
        } else {
            int32_t y_33321;
            bool cond_33322;
            float res_33323;
            
            // Remaining columns: 1 where col == res_31237 + row, else 0.
            y_33321 = res_31237 + x_36846;
            cond_33322 = x_36848 == y_33321;
            if (cond_33322) {
                res_33323 = 1.0F;
            } else {
                res_33323 = 0.0F;
            }
            x_36850 = res_33323;
        }
    }
    
    __local char *mem_38009;
    __local char *mem_38013;
    
    mem_38009 = (__local char *) mem_38009_backing_0;
    // Store the initial element at the slot given by the local id. The loop
    // body runs exactly once.
    for (int32_t comb_iter_38689 = 0; comb_iter_38689 < 1; comb_iter_38689++) {
        int32_t ctid_33145;
        int32_t flat_comb_id_38690 = comb_iter_38689 * res_31373 +
                local_tid_33150;
        
        ctid_33145 = flat_comb_id_38690;
        if (slt32(ctid_33145, res_31373) && 1) {
            *(__local float *) &mem_38009[ctid_33145 * 4] = x_36850;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    mem_38013 = (__local char *) mem_38013_backing_1;
    // NOTE(review): the two barriers inside the loop below sit under this
    // bounds guard. OpenCL requires every work-item of a group to reach a
    // barrier, so this is only valid if the guard evaluates the same way for
    // the whole group - confirm against the launch configuration.
    if (slt32(gtid_33143, sizze_31215) && slt32(ltid_33144, res_31373)) {
        for (int32_t i_33326 = 0; i_33326 < res_31237; i_33326++) {
            float res_33327;
            bool cond_33328;
            float x_36857;
            
            // Element i_33326 of row 0 of the working matrix.
            res_33327 = *(__local float *) &mem_38009[i_33326 * 4];
            cond_33328 = res_33327 == 0.0F;
            if (cond_33328) {
                int32_t x_33334;
                int32_t i_33335;
                float res_33336;
                
                // That element is exactly zero: keep this element's current
                // value for this step.
                x_33334 = j_31369 * x_36846;
                i_33335 = x_33334 + x_36848;
                res_33336 = *(__local float *) &mem_38009[i_33335 * 4];
                x_36857 = res_33336;
            } else {
                float x_33337;
                float res_33338;
                bool cond_33339;
                float res_33340;
                
                // Row 0 entry in this column divided by the row 0 entry at
                // index i_33326.
                x_33337 = *(__local float *) &mem_38009[x_36848 * 4];
                res_33338 = x_33337 / res_33327;
                cond_33339 = slt32(x_36846, m_31319);
                if (cond_33339) {
                    int32_t x_33341;
                    int32_t x_33342;
                    int32_t i_33343;
                    float x_33344;
                    int32_t i_33345;
                    float x_33346;
                    float y_33347;
                    float res_33348;
                    
                    // New value is taken from row + 1: its entry in this
                    // column minus the scaled row 0 entry times that row's
                    // entry in column i_33326.
                    x_33341 = 1 + x_36846;
                    x_33342 = j_31369 * x_33341;
                    i_33343 = x_33342 + x_36848;
                    x_33344 = *(__local float *) &mem_38009[i_33343 * 4];
                    i_33345 = i_33326 + x_33342;
                    x_33346 = *(__local float *) &mem_38009[i_33345 * 4];
                    y_33347 = res_33338 * x_33346;
                    res_33348 = x_33344 - y_33347;
                    res_33340 = res_33348;
                } else {
                    // Rows at or beyond m_31319 receive the scaled row 0
                    // entry itself.
                    res_33340 = res_33338;
                }
                x_36857 = res_33340;
            }
            // Stage the new value in the second buffer so every work-item
            // finishes reading mem_38009 before it is overwritten.
            for (int32_t comb_iter_38692 = 0; comb_iter_38692 < 1;
                 comb_iter_38692++) {
                int32_t ctid_33146;
                int32_t flat_comb_id_38693 = comb_iter_38692 * res_31373 +
                        local_tid_33150;
                
                ctid_33146 = flat_comb_id_38693;
                if (slt32(ctid_33146, res_31373) && 1) {
                    *(__local float *) &mem_38013[ctid_33146 * 4] = x_36857;
                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            
            // NOTE(review): the staged value was written at the local id but
            // is read back at ltid_33144; presumably the two are equal for
            // the launch configuration used - confirm.
            float x_36860 = *(__local float *) &mem_38013[ltid_33144 * 4];
            
            // Copy the staged value back into the working matrix.
            for (int32_t comb_iter_38694 = 0; comb_iter_38694 < 1;
                 comb_iter_38694++) {
                int32_t ctid_33147;
                int32_t flat_comb_id_38695 = comb_iter_38694 * res_31373 +
                        local_tid_33150;
                
                ctid_33147 = flat_comb_id_38695;
                if (slt32(ctid_33147, res_31373) && 1) {
                    if (sle32(0, ltid_33144) && slt32(ltid_33144, res_31373)) {
                        *(__local float *) &mem_38009[ltid_33144 * 4] = x_36860;
                    }
                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }
    // Copy res_31237 * j_m_i_31370 floats to the output, with work-items
    // taking elements local id, local id + res_31373, and so on. Element e
    // maps to row e / j_m_i_31370 and column e mod j_m_i_31370; it is read
    // from the working matrix at column res_31237 + column of that row and
    // written to the group's slice of mem_38019.
    for (int32_t i_38696 = 0; i_38696 < squot32(res_31237 * j_m_i_31370 -
                                                local_tid_33150 + res_31373 - 1,
                                                res_31373); i_38696++) {
        *(__global float *) &mem_38019[(group_id_33151 * (j_m_i_31370 *
                                                          res_31237) +
                                        squot32(i_38696 * res_31373 +
                                                local_tid_33150, j_m_i_31370) *
                                        j_m_i_31370 + (i_38696 * res_31373 +
                                                       local_tid_33150 -
                                                       squot32(i_38696 *
                                                               res_31373 +
                                                               local_tid_33150,
                                                               j_m_i_31370) *
                                                       j_m_i_31370)) * 4] =
            *(__local float *) &mem_38009[(res_31237 + (squot32(i_38696 *
                                                                res_31373 +
                                                                local_tid_33150,
                                                                j_m_i_31370) *
                                                        j_31369 + (i_38696 *
                                                                   res_31373 +
                                                                   local_tid_33150 -
                                                                   squot32(i_38696 *
                                                                           res_31373 +
                                                                           local_tid_33150,
                                                                           j_m_i_31370) *
                                                                   j_m_i_31370))) *
                                          4];
    }
}
// map_intra_group_33844: every work-item accumulates one NaN-masked sum of
// products and the work-group copies the results to global memory.
//
// The global id is split into a row (quotient by res_31237) and a column
// (remainder). An in-range work-item sums, for k in [0, n_31219), the product
// mem_37907[k, col] * images_mem_37894[row, k], using 0 in place of the
// product when the images value is NaN. The sum is stored in local memory at
// the local id and, after a barrier, copied to mem_38060 at
// group_id * res_31237 + local id.
//
// Parameters:
//   mem_38056_backing_aligned_0  local scratch, accessed as floats
//   sizze_31215  upper bound checked against the row index
//   sizze_31216  row stride, in floats, of images_mem_37894
//   n_31219      number of products accumulated per work-item
//   res_31237    upper bound for the column index; row stride of mem_37907
//   images_mem_37894, mem_37907  global inputs, read as float
//   mem_38060    global output, written as float
__kernel void map_intra_group_33844(__local volatile
                                    int64_t *mem_38056_backing_aligned_0,
                                    int32_t sizze_31215, int32_t sizze_31216,
                                    int32_t n_31219, int32_t res_31237, __global
                                    unsigned char *images_mem_37894, __global
                                    unsigned char *mem_37907, __global
                                    unsigned char *mem_38060)
{
    // Byte view of the local scratch buffer.
    __local volatile char *restrict backing_bytes =
                          mem_38056_backing_aligned_0;
    __local char *scratch = (__local char *) backing_bytes;

    const int32_t global_id = get_global_id(0);
    const int32_t local_id = get_local_id(0);
    const int32_t group_id = get_group_id(0);

    // Row and column of this work-item within the res_31237-wide layout.
    const int32_t row = squot32(global_id, res_31237);
    const int32_t col = global_id - row * res_31237;

    // Only assigned for in-range work-items.
    float masked_dot;

    if (slt32(row, sizze_31215) && slt32(col, res_31237)) {
        float acc = 0.0F;

        for (int32_t k = 0; k < n_31219; k++) {
            const float m_val =
                *(__global float *) &mem_37907[(k * res_31237 + col) * 4];
            const float img_val =
                *(__global float *) &images_mem_37894[(row * sizze_31216 + k) *
                                                      4];
            float term;

            // A NaN images value contributes zero to the sum.
            if (futrts_isnan32(img_val)) {
                term = 0.0F;
            } else {
                float prod = m_val * img_val;

                term = prod;
            }
            acc = acc + term;
        }
        masked_dot = acc;
    }

    // Publish the result in the slot given by the local id.
    if (slt32(local_id, res_31237) && 1) {
        *(__local float *) &scratch[local_id * 4] = masked_dot;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Copy res_31237 floats from local memory to this group's output slice,
    // with work-items taking elements local id, local id + res_31237, ...
    const int32_t n_copies = squot32(res_31237 - local_id + res_31237 - 1,
                                     res_31237);

    for (int32_t i = 0; i < n_copies; i++) {
        const int32_t elem = i * res_31237 + local_id;

        *(__global float *) &mem_38060[(group_id * res_31237 + elem) * 4] =
            *(__local float *) &scratch[elem * 4];
    }
}
// map_intra_group_33966: each work-group adds up n_31219 NaN-masked products
// and writes one float per group.
//
// A work-item unflattens its global id into (gtid_33959, gtid_33960,
// ltid_33962) and, when all three are in range, computes
//   arg_mem_37903[gtid_33960, ltid] * images_mem_37894[gtid_33959, ltid]
// or 0 when the images value is NaN. The value is stored in local memory at
// the work-item's local id, the stored values are combined by addition, and
// work-item 0 writes slot 0 of local memory to mem_38078[group_id].
//
// Parameters:
//   mem_38075_backing_aligned_0  local scratch, accessed as floats indexed by
//                                local id
//   sizze_31214  row stride, in floats, of arg_mem_37903
//   sizze_31215  upper bound checked against gtid_33959
//   sizze_31216  row stride, in floats, of images_mem_37894
//   n_31219      upper bound for ltid_33962 and number of local slots combined
//   res_31237    upper bound checked against gtid_33960
//   images_mem_37894, arg_mem_37903  global inputs, read as float
//   mem_38078    global output, one float per work-group
__kernel void map_intra_group_33966(__local volatile
                                    int64_t *mem_38075_backing_aligned_0,
                                    int32_t sizze_31214, int32_t sizze_31215,
                                    int32_t sizze_31216, int32_t n_31219,
                                    int32_t res_31237, __global
                                    unsigned char *images_mem_37894, __global
                                    unsigned char *arg_mem_37903, __global
                                    unsigned char *mem_38078)
{
    // block_dim0..2 and group_sizze_38731 are not referenced again in this
    // kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    __local volatile char *restrict mem_38075_backing_0 =
                          mem_38075_backing_aligned_0;
    int32_t global_tid_33966;
    int32_t local_tid_33967;
    int32_t group_sizze_38731;
    int32_t wave_sizze_38730;
    int32_t group_id_33968;
    
    global_tid_33966 = get_global_id(0);
    local_tid_33967 = get_local_id(0);
    group_sizze_38731 = get_local_size(0);
    wave_sizze_38730 = LOCKSTEP_WIDTH;
    group_id_33968 = get_group_id(0);
    
    int32_t gtid_33959;
    int32_t gtid_33960;
    int32_t ltid_33962;
    
    // Unflatten global_tid_33966 by successive quotient and remainder with
    // divisors res_31237 * n_31219 and n_31219. ltid_33962 is the final
    // remainder.
    gtid_33959 = squot32(global_tid_33966, res_31237 * n_31219);
    gtid_33960 = squot32(global_tid_33966 - squot32(global_tid_33966,
                                                    res_31237 * n_31219) *
                         (res_31237 * n_31219), n_31219);
    ltid_33962 = global_tid_33966 - squot32(global_tid_33966, res_31237 *
                                            n_31219) * (res_31237 * n_31219) -
        squot32(global_tid_33966 - squot32(global_tid_33966, res_31237 *
                                           n_31219) * (res_31237 * n_31219),
                n_31219) * n_31219;
    
    float x_36940;
    float x_36942;
    bool res_34120;
    float res_34121;
    
    // Per-work-item value; res_34121 is only assigned when every index is in
    // range.
    if ((slt32(gtid_33959, sizze_31215) && slt32(gtid_33960, res_31237)) &&
        slt32(ltid_33962, n_31219)) {
        x_36940 = *(__global float *) &arg_mem_37903[(gtid_33960 * sizze_31214 +
                                                      ltid_33962) * 4];
        x_36942 = *(__global float *) &images_mem_37894[(gtid_33959 *
                                                         sizze_31216 +
                                                         ltid_33962) * 4];
        // A NaN images value yields 0 instead of the product.
        res_34120 = futrts_isnan32(x_36942);
        if (res_34120) {
            res_34121 = 0.0F;
        } else {
            float res_34122 = x_36940 * x_36942;
            
            res_34121 = res_34122;
        }
    }
    
    __local char *mem_38075;
    float res_34123;
    
    mem_38075 = (__local char *) mem_38075_backing_0;
    // Publish the per-work-item value to local memory. The loop body runs
    // exactly once, so the slot index equals local_tid_33967.
    // NOTE(review): this guard tests the local id while res_34121 was assigned
    // under a guard on the unflattened global id; presumably the launch makes
    // the two agree - confirm.
    for (int32_t comb_iter_38732 = 0; comb_iter_38732 < 1; comb_iter_38732++) {
        int32_t ctid_33964;
        int32_t flat_comb_id_38733 = comb_iter_38732 * n_31219 +
                local_tid_33967;
        
        ctid_33964 = flat_comb_id_38733;
        if (slt32(ctid_33964, n_31219) && 1) {
            *(__local float *) &mem_38075[ctid_33964 * 4] = res_34121;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t offset_38734;
    int32_t skip_waves_38735;
    float x_34124;
    float x_34125;
    
    offset_38734 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_33967, n_31219)) {
            x_34124 = *(__local float *) &mem_38075[(local_tid_33967 +
                                                     offset_38734) * 4];
        }
    }
    offset_38734 = 1;
    // Stage 1: strides 1, 2, 4, ... while the stride is below
    // wave_sizze_38730. A work-item takes part when its partner slot is below
    // n_31219 and its position inside its wave_sizze_38730-sized chunk is a
    // multiple of twice the stride. There is no barrier in this loop; the
    // partner read and the result write go through volatile pointers.
    // NOTE(review): presumably this relies on LOCKSTEP_WIDTH work-items
    // executing in lockstep - confirm for the target device.
    while (slt32(offset_38734, wave_sizze_38730)) {
        if (slt32(local_tid_33967 + offset_38734, n_31219) &&
            ((local_tid_33967 - squot32(local_tid_33967, wave_sizze_38730) *
              wave_sizze_38730) & (2 * offset_38734 - 1)) == 0) {
            // read array element
            {
                x_34125 = *(volatile __local
                            float *) &mem_38075[(local_tid_33967 +
                                                 offset_38734) * 4];
            }
            // apply reduction operation
            {
                float res_34126;
                
                // NOTE(review): res_34126 stays unassigned when this bounds
                // guard is false and is still copied into x_34124 below.
                if ((slt32(gtid_33959, sizze_31215) && slt32(gtid_33960,
                                                             res_31237)) &&
                    slt32(ltid_33962, n_31219)) {
                    res_34126 = x_34124 + x_34125;
                }
                x_34124 = res_34126;
            }
            // write result of operation
            {
                *(volatile __local float *) &mem_38075[local_tid_33967 * 4] =
                    x_34124;
            }
        }
        offset_38734 *= 2;
    }
    skip_waves_38735 = 1;
    // Stage 2: combine the per-chunk partial sums. The stride is
    // skip_waves_38735 * wave_sizze_38730 and a barrier precedes every step.
    // Only the first work-item of a chunk whose chunk index is a multiple of
    // twice skip_waves_38735 takes part.
    while (slt32(skip_waves_38735, squot32(n_31219 + wave_sizze_38730 - 1,
                                           wave_sizze_38730))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_38734 = skip_waves_38735 * wave_sizze_38730;
        if (slt32(local_tid_33967 + offset_38734, n_31219) &&
            ((local_tid_33967 - squot32(local_tid_33967, wave_sizze_38730) *
              wave_sizze_38730) == 0 && (squot32(local_tid_33967,
                                                 wave_sizze_38730) & (2 *
                                                                      skip_waves_38735 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_34125 = *(__local float *) &mem_38075[(local_tid_33967 +
                                                         offset_38734) * 4];
            }
            // apply reduction operation
            {
                float res_34126;
                
                // NOTE(review): same conditional assignment as in stage 1.
                if ((slt32(gtid_33959, sizze_31215) && slt32(gtid_33960,
                                                             res_31237)) &&
                    slt32(ltid_33962, n_31219)) {
                    res_34126 = x_34124 + x_34125;
                }
                x_34124 = res_34126;
            }
            // write result of operation
            {
                *(__local float *) &mem_38075[local_tid_33967 * 4] = x_34124;
            }
        }
        skip_waves_38735 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Slot 0 holds the combined value; every work-item reads it but only
    // work-item 0 stores it to the per-group output.
    res_34123 = *(__local float *) &mem_38075[0];
    if (local_tid_33967 == 0) {
        *(__global float *) &mem_38078[group_id_33968 * 4] = res_34123;
    }
}
// map_intra_group_34180: every work-item accumulates one sum of products and
// the work-group copies the results to global memory.
//
// The global id is split into a row (quotient by res_31237) and a column
// (remainder). An in-range work-item sums, for k in [0, j_m_i_31370), the
// product of the float at res_mem_38086[row * res_31237 + k] and the float at
// mem_38112[k * (res_31237 * sizze_31215) + row * res_31237 + col]. The sum
// is stored in local memory at the local id and, after a barrier, copied to
// mem_38119 at group_id * res_31237 + local id.
//
// Parameters:
//   mem_38115_backing_aligned_0  local scratch, accessed as floats
//   sizze_31215  upper bound checked against the row index
//   res_31237    upper bound for the column index
//   j_m_i_31370  number of products accumulated per work-item
//   res_mem_38086, mem_38112  global inputs, read as float
//   mem_38119    global output, written as float
__kernel void map_intra_group_34180(__local volatile
                                    int64_t *mem_38115_backing_aligned_0,
                                    int32_t sizze_31215, int32_t res_31237,
                                    int32_t j_m_i_31370, __global
                                    unsigned char *res_mem_38086, __global
                                    unsigned char *mem_38112, __global
                                    unsigned char *mem_38119)
{
    // Byte view of the local scratch buffer.
    __local volatile char *restrict backing_bytes =
                          mem_38115_backing_aligned_0;
    __local char *scratch = (__local char *) backing_bytes;

    const int32_t global_id = get_global_id(0);
    const int32_t local_id = get_local_id(0);
    const int32_t group_id = get_group_id(0);

    // Row and column of this work-item within the res_31237-wide layout.
    const int32_t row = squot32(global_id, res_31237);
    const int32_t col = global_id - row * res_31237;

    // Only assigned for in-range work-items.
    float dot;

    if (slt32(row, sizze_31215) && slt32(col, res_31237)) {
        float acc = 0.0F;

        for (int32_t k = 0; k < j_m_i_31370; k++) {
            const float lhs =
                *(__global float *) &res_mem_38086[(row * res_31237 + k) * 4];
            const float rhs =
                *(__global float *) &mem_38112[(k * (res_31237 * sizze_31215) +
                                                row * res_31237 + col) * 4];
            // Product and accumulation stay in separate statements.
            float prod;

            prod = lhs * rhs;
            acc = acc + prod;
        }
        dot = acc;
    }

    // Publish the result in the slot given by the local id.
    if (slt32(local_id, res_31237) && 1) {
        *(__local float *) &scratch[local_id * 4] = dot;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Copy res_31237 floats from local memory to this group's output slice,
    // with work-items taking elements local id, local id + res_31237, ...
    const int32_t n_copies = squot32(res_31237 - local_id + res_31237 - 1,
                                     res_31237);

    for (int32_t i = 0; i < n_copies; i++) {
        const int32_t elem = i * res_31237 + local_id;

        *(__global float *) &mem_38119[(group_id * res_31237 + elem) * 4] =
            *(__local float *) &scratch[elem * 4];
    }
}
// Kernel map_intra_group_34303.
//
// Each work-item multiplies one float from res_mem_38086 by one float from
// res_mem_38037. The products of the first j_m_i_31370 work-items of a
// work-group are then summed in __local memory, and work-item 0 stores the
// sum in mem_38134 at the float slot given by the group id.
//
// Parameters:
//   mem_38131_backing_aligned_0  __local scratch buffer; holds one float per
//                                participating work-item.
//   sizze_31215, res_31237, j_m_i_31370
//                                extents used to split the flat global id
//                                and to guard the loads.
//   res_mem_38037, res_mem_38086 input buffers, read as float.
//   mem_38134                    output buffer, one float per work-group.
//
// NOTE(review): slt32 and squot32 are defined outside this kernel; they are
// presumably signed less-than and signed integer division -- confirm.
// NOTE(review): the reduction indexes by local id while the loads index by
// ids derived from the global id; this presumably assumes a work-group size
// of j_m_i_31370 -- confirm against the host-side launch code.
__kernel void map_intra_group_34303(__local volatile
                                    int64_t *mem_38131_backing_aligned_0,
                                    int32_t sizze_31215, int32_t res_31237,
                                    int32_t j_m_i_31370, __global
                                    unsigned char *res_mem_38037, __global
                                    unsigned char *res_mem_38086, __global
                                    unsigned char *mem_38134)
{
    // block_dim0..2 are not referenced anywhere else in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    __local volatile char *restrict mem_38131_backing_0 =
                          mem_38131_backing_aligned_0;
    int32_t global_tid_34303;
    int32_t local_tid_34304;
    int32_t group_sizze_38792;
    int32_t wave_sizze_38791;
    int32_t group_id_34305;
    
    global_tid_34303 = get_global_id(0);
    local_tid_34304 = get_local_id(0);
    // group_sizze_38792 is assigned here but never read in this kernel.
    group_sizze_38792 = get_local_size(0);
    wave_sizze_38791 = LOCKSTEP_WIDTH;
    group_id_34305 = get_group_id(0);
    
    int32_t gtid_34296;
    int32_t gtid_34297;
    int32_t ltid_34299;
    
    // Split the flat global id into three indices: the outermost one, then one
    // with extent res_31237, then the innermost with extent j_m_i_31370. Each
    // remainder is written out as x - squot32(x, d) * d.
    gtid_34296 = squot32(global_tid_34303, res_31237 * j_m_i_31370);
    gtid_34297 = squot32(global_tid_34303 - squot32(global_tid_34303,
                                                    res_31237 * j_m_i_31370) *
                         (res_31237 * j_m_i_31370), j_m_i_31370);
    ltid_34299 = global_tid_34303 - squot32(global_tid_34303, res_31237 *
                                            j_m_i_31370) * (res_31237 *
                                                            j_m_i_31370) -
        squot32(global_tid_34303 - squot32(global_tid_34303, res_31237 *
                                           j_m_i_31370) * (res_31237 *
                                                           j_m_i_31370),
                j_m_i_31370) * j_m_i_31370;
    
    int32_t binop_x_36980;
    int32_t binop_x_36981;
    int32_t new_index_36982;
    int32_t binop_y_36988;
    int32_t new_index_36989;
    float x_36976;
    float x_36978;
    float res_34449;
    
    // Only in-bounds work-items load and multiply; for the others res_34449
    // is left unassigned.
    if ((slt32(gtid_34296, sizze_31215) && slt32(gtid_34297, res_31237)) &&
        slt32(ltid_34299, j_m_i_31370)) {
        // Flat offset ltid + j_m_i * gtid_34296, split into a quotient and a
        // remainder by res_31237 and recombined in the load index below.
        binop_x_36980 = j_m_i_31370 * gtid_34296;
        binop_x_36981 = ltid_34299 + binop_x_36980;
        new_index_36982 = squot32(binop_x_36981, res_31237);
        binop_y_36988 = res_31237 * new_index_36982;
        new_index_36989 = binop_x_36981 - binop_y_36988;
        x_36976 = *(__global float *) &res_mem_38086[(new_index_36982 *
                                                      res_31237 +
                                                      new_index_36989) * 4];
        // Second operand: element (gtid_34296, gtid_34297, ltid_34299) of a
        // buffer indexed with strides (j_m_i * res, j_m_i, 1).
        x_36978 = *(__global float *) &res_mem_38037[(gtid_34296 *
                                                      (j_m_i_31370 *
                                                       res_31237) + gtid_34297 *
                                                      j_m_i_31370 +
                                                      ltid_34299) * 4];
        res_34449 = x_36976 * x_36978;
    }
    
    __local char *mem_38131;
    float res_34450;
    
    mem_38131 = (__local char *) mem_38131_backing_0;
    // Single-iteration loop: work-items with local id below j_m_i_31370 store
    // their product at their own float slot of the scratch buffer.
    for (int32_t comb_iter_38793 = 0; comb_iter_38793 < 1; comb_iter_38793++) {
        int32_t ctid_34301;
        int32_t flat_comb_id_38794 = comb_iter_38793 * j_m_i_31370 +
                local_tid_34304;
        
        ctid_34301 = flat_comb_id_38794;
        if (slt32(ctid_34301, j_m_i_31370) && 1) {
            *(__local float *) &mem_38131[ctid_34301 * 4] = res_34449;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t offset_38795;
    int32_t skip_waves_38796;
    float x_34451;
    float x_34452;
    
    offset_38795 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_34304, j_m_i_31370)) {
            x_34451 = *(__local float *) &mem_38131[(local_tid_34304 +
                                                     offset_38795) * 4];
        }
    }
    // Phase 1: tree reduction with strides 1, 2, 4, ... below the lockstep
    // width. A work-item takes part when its position inside its wave is a
    // multiple of 2 * offset and its partner slot is below j_m_i_31370.
    // Accesses are volatile and no barrier is issued inside this loop.
    offset_38795 = 1;
    while (slt32(offset_38795, wave_sizze_38791)) {
        if (slt32(local_tid_34304 + offset_38795, j_m_i_31370) &&
            ((local_tid_34304 - squot32(local_tid_34304, wave_sizze_38791) *
              wave_sizze_38791) & (2 * offset_38795 - 1)) == 0) {
            // read array element
            {
                x_34452 = *(volatile __local
                            float *) &mem_38131[(local_tid_34304 +
                                                 offset_38795) * 4];
            }
            // apply reduction operation
            {
                float res_34453;
                
                // NOTE(review): when this guard is false res_34453 stays
                // unassigned and is still copied into x_34451 below.
                if ((slt32(gtid_34296, sizze_31215) && slt32(gtid_34297,
                                                             res_31237)) &&
                    slt32(ltid_34299, j_m_i_31370)) {
                    res_34453 = x_34451 + x_34452;
                }
                x_34451 = res_34453;
            }
            // write result of operation
            {
                *(volatile __local float *) &mem_38131[local_tid_34304 * 4] =
                    x_34451;
            }
        }
        offset_38795 *= 2;
    }
    // Phase 2: combine the partial sums held by the first work-item of each
    // wave, doubling the wave stride every step. A barrier precedes each step.
    skip_waves_38796 = 1;
    while (slt32(skip_waves_38796, squot32(j_m_i_31370 + wave_sizze_38791 - 1,
                                           wave_sizze_38791))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_38795 = skip_waves_38796 * wave_sizze_38791;
        if (slt32(local_tid_34304 + offset_38795, j_m_i_31370) &&
            ((local_tid_34304 - squot32(local_tid_34304, wave_sizze_38791) *
              wave_sizze_38791) == 0 && (squot32(local_tid_34304,
                                                 wave_sizze_38791) & (2 *
                                                                      skip_waves_38796 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_34452 = *(__local float *) &mem_38131[(local_tid_34304 +
                                                         offset_38795) * 4];
            }
            // apply reduction operation
            {
                float res_34453;
                
                if ((slt32(gtid_34296, sizze_31215) && slt32(gtid_34297,
                                                             res_31237)) &&
                    slt32(ltid_34299, j_m_i_31370)) {
                    res_34453 = x_34451 + x_34452;
                }
                x_34451 = res_34453;
            }
            // write result of operation
            {
                *(__local float *) &mem_38131[local_tid_34304 * 4] = x_34451;
            }
        }
        skip_waves_38796 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Slot 0 now holds the group total; every work-item reads it but only
    // work-item 0 writes it to the output buffer.
    res_34450 = *(__local float *) &mem_38131[0];
    if (local_tid_34304 == 0) {
        *(__global float *) &mem_38134[group_id_34305 * 4] = res_34450;
    }
}
// Kernel map_intra_group_34505.
//
// Each in-bounds work-item accumulates, for k in [0, res_31237), the product
// of the float at index (row * res_31237 + k) of res_mem_38142 and the float
// at index (k * sizze_31214 + col) of mem_38162, where (row, col) is the flat
// global id split by sizze_31214. The sums are staged in __local memory and
// then copied to mem_38169 at float offset group_id * sizze_31214 + local id.
//
// Parameters:
//   mem_38165_backing_aligned_0  __local staging buffer, one float per
//                                participating work-item.
//   sizze_31214                  inner extent (columns) used for the id
//                                split, the staging guard and the copy-out.
//   sizze_31215                  bound on the row index.
//   res_31237                    number of accumulated products.
//   res_mem_38142, mem_38162     input buffers, read as float.
//   mem_38169                    output buffer, written as float.
//
// NOTE(review): slt32 and squot32 are defined outside this kernel; they are
// presumably signed less-than and signed integer division -- confirm.
__kernel void map_intra_group_34505(__local volatile
                                    int64_t *mem_38165_backing_aligned_0,
                                    int32_t sizze_31214, int32_t sizze_31215,
                                    int32_t res_31237, __global
                                    unsigned char *res_mem_38142, __global
                                    unsigned char *mem_38162, __global
                                    unsigned char *mem_38169)
{
    const int32_t global_tid = get_global_id(0);
    const int32_t local_tid = get_local_id(0);
    const int32_t group_id = get_group_id(0);

    // Split the flat global id into (row, col) with sizze_31214 columns.
    const int32_t row = squot32(global_tid, sizze_31214);
    const int32_t col = global_tid - row * sizze_31214;

    // Left unassigned for out-of-bounds work-items, as in the generated code.
    float row_col_sum;

    if (slt32(row, sizze_31215) && slt32(col, sizze_31214)) {
        float acc = 0.0F;

        for (int32_t k = 0; k < res_31237; k++) {
            const float lhs =
                *(__global float *) &res_mem_38142[(row * res_31237 + k) * 4];
            const float rhs =
                *(__global float *) &mem_38162[(k * sizze_31214 + col) * 4];
            // Multiply and add stay in separate statements so the
            // floating-point evaluation is not changed by this rewrite.
            const float prod = lhs * rhs;

            acc = acc + prod;
        }
        row_col_sum = acc;
    }

    // Stage the per-work-item result in local memory, byte-addressed.
    __local char *scratch = (__local char *) mem_38165_backing_aligned_0;

    if (slt32(local_tid, sizze_31214)) {
        *(__local float *) &scratch[local_tid * 4] = row_col_sum;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Copy the staged floats to the output region of this group. The pass
    // count is loop-invariant, so it is computed once up front.
    const int32_t copy_passes = squot32(sizze_31214 - local_tid +
                                        sizze_31214 - 1, sizze_31214);

    for (int32_t pass = 0; pass < copy_passes; pass++) {
        const int32_t elem = pass * sizze_31214 + local_tid;

        *(__global float *) &mem_38169[(group_id * sizze_31214 + elem) * 4] =
            *(__local float *) &scratch[elem * 4];
    }
}
// Kernel map_intra_group_34621.
//
// Each work-item multiplies one float from res_mem_38142 by one float from
// mem_37911. The products of the first res_31237 work-items of a work-group
// are then summed in __local memory, and work-item 0 stores the sum in
// mem_38191 at the float slot given by the group id.
//
// Parameters:
//   mem_38188_backing_aligned_0  __local scratch buffer; holds one float per
//                                participating work-item.
//   sizze_31214, sizze_31215, res_31237
//                                extents used to split the flat global id
//                                and to guard the loads.
//   mem_37911, res_mem_38142     input buffers, read as float.
//   mem_38191                    output buffer, one float per work-group.
//
// NOTE(review): slt32 and squot32 are defined outside this kernel; they are
// presumably signed less-than and signed integer division -- confirm.
// NOTE(review): the reduction indexes by local id while the loads index by
// ids derived from the global id; this presumably assumes a work-group size
// of res_31237 -- confirm against the host-side launch code.
__kernel void map_intra_group_34621(__local volatile
                                    int64_t *mem_38188_backing_aligned_0,
                                    int32_t sizze_31214, int32_t sizze_31215,
                                    int32_t res_31237, __global
                                    unsigned char *mem_37911, __global
                                    unsigned char *res_mem_38142, __global
                                    unsigned char *mem_38191)
{
    // block_dim0..2 are not referenced anywhere else in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    __local volatile char *restrict mem_38188_backing_0 =
                          mem_38188_backing_aligned_0;
    int32_t global_tid_34621;
    int32_t local_tid_34622;
    int32_t group_sizze_38853;
    int32_t wave_sizze_38852;
    int32_t group_id_34623;
    
    global_tid_34621 = get_global_id(0);
    local_tid_34622 = get_local_id(0);
    // group_sizze_38853 is assigned here but never read in this kernel.
    group_sizze_38853 = get_local_size(0);
    wave_sizze_38852 = LOCKSTEP_WIDTH;
    group_id_34623 = get_group_id(0);
    
    int32_t gtid_34614;
    int32_t gtid_34615;
    int32_t ltid_34617;
    
    // Split the flat global id into three indices: the outermost one, then one
    // with extent sizze_31214, then the innermost with extent res_31237. Each
    // remainder is written out as x - squot32(x, d) * d.
    gtid_34614 = squot32(global_tid_34621, sizze_31214 * res_31237);
    gtid_34615 = squot32(global_tid_34621 - squot32(global_tid_34621,
                                                    sizze_31214 * res_31237) *
                         (sizze_31214 * res_31237), res_31237);
    ltid_34617 = global_tid_34621 - squot32(global_tid_34621, sizze_31214 *
                                            res_31237) * (sizze_31214 *
                                                          res_31237) -
        squot32(global_tid_34621 - squot32(global_tid_34621, sizze_31214 *
                                           res_31237) * (sizze_31214 *
                                                         res_31237),
                res_31237) * res_31237;
    
    float x_37006;
    float x_37008;
    float res_34767;
    
    // Only in-bounds work-items load and multiply; for the others res_34767
    // is left unassigned.
    if ((slt32(gtid_34614, sizze_31215) && slt32(gtid_34615, sizze_31214)) &&
        slt32(ltid_34617, res_31237)) {
        // Both operands are indexed with a stride of res_31237: the first by
        // the outermost index, the second by the middle index.
        x_37006 = *(__global float *) &res_mem_38142[(gtid_34614 * res_31237 +
                                                      ltid_34617) * 4];
        x_37008 = *(__global float *) &mem_37911[(gtid_34615 * res_31237 +
                                                  ltid_34617) * 4];
        res_34767 = x_37006 * x_37008;
    }
    
    __local char *mem_38188;
    float res_34768;
    
    mem_38188 = (__local char *) mem_38188_backing_0;
    // Single-iteration loop: work-items with local id below res_31237 store
    // their product at their own float slot of the scratch buffer.
    for (int32_t comb_iter_38854 = 0; comb_iter_38854 < 1; comb_iter_38854++) {
        int32_t ctid_34619;
        int32_t flat_comb_id_38855 = comb_iter_38854 * res_31237 +
                local_tid_34622;
        
        ctid_34619 = flat_comb_id_38855;
        if (slt32(ctid_34619, res_31237) && 1) {
            *(__local float *) &mem_38188[ctid_34619 * 4] = res_34767;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t offset_38856;
    int32_t skip_waves_38857;
    float x_34769;
    float x_34770;
    
    offset_38856 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_34622, res_31237)) {
            x_34769 = *(__local float *) &mem_38188[(local_tid_34622 +
                                                     offset_38856) * 4];
        }
    }
    // Phase 1: tree reduction with strides 1, 2, 4, ... below the lockstep
    // width. A work-item takes part when its position inside its wave is a
    // multiple of 2 * offset and its partner slot is below res_31237.
    // Accesses are volatile and no barrier is issued inside this loop.
    offset_38856 = 1;
    while (slt32(offset_38856, wave_sizze_38852)) {
        if (slt32(local_tid_34622 + offset_38856, res_31237) &&
            ((local_tid_34622 - squot32(local_tid_34622, wave_sizze_38852) *
              wave_sizze_38852) & (2 * offset_38856 - 1)) == 0) {
            // read array element
            {
                x_34770 = *(volatile __local
                            float *) &mem_38188[(local_tid_34622 +
                                                 offset_38856) * 4];
            }
            // apply reduction operation
            {
                float res_34771;
                
                // NOTE(review): when this guard is false res_34771 stays
                // unassigned and is still copied into x_34769 below.
                if ((slt32(gtid_34614, sizze_31215) && slt32(gtid_34615,
                                                             sizze_31214)) &&
                    slt32(ltid_34617, res_31237)) {
                    res_34771 = x_34769 + x_34770;
                }
                x_34769 = res_34771;
            }
            // write result of operation
            {
                *(volatile __local float *) &mem_38188[local_tid_34622 * 4] =
                    x_34769;
            }
        }
        offset_38856 *= 2;
    }
    // Phase 2: combine the partial sums held by the first work-item of each
    // wave, doubling the wave stride every step. A barrier precedes each step.
    skip_waves_38857 = 1;
    while (slt32(skip_waves_38857, squot32(res_31237 + wave_sizze_38852 - 1,
                                           wave_sizze_38852))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_38856 = skip_waves_38857 * wave_sizze_38852;
        if (slt32(local_tid_34622 + offset_38856, res_31237) &&
            ((local_tid_34622 - squot32(local_tid_34622, wave_sizze_38852) *
              wave_sizze_38852) == 0 && (squot32(local_tid_34622,
                                                 wave_sizze_38852) & (2 *
                                                                      skip_waves_38857 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_34770 = *(__local float *) &mem_38188[(local_tid_34622 +
                                                         offset_38856) * 4];
            }
            // apply reduction operation
            {
                float res_34771;
                
                if ((slt32(gtid_34614, sizze_31215) && slt32(gtid_34615,
                                                             sizze_31214)) &&
                    slt32(ltid_34617, res_31237)) {
                    res_34771 = x_34769 + x_34770;
                }
                x_34769 = res_34771;
            }
            // write result of operation
            {
                *(__local float *) &mem_38188[local_tid_34622 * 4] = x_34769;
            }
        }
        skip_waves_38857 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Slot 0 now holds the group total; every work-item reads it but only
    // work-item 0 writes it to the output buffer.
    res_34768 = *(__local float *) &mem_38188[0];
    if (local_tid_34622 == 0) {
        *(__global float *) &mem_38191[group_id_34623 * 4] = res_34768;
    }
}
// Kernel map_intra_group_34818.
//
// Steps performed by each work-group:
//   1. Each in-bounds work-item computes images_mem_37894 minus
//      res_mem_38199 for its element (NaN if the image value is NaN) and a
//      flag that is 1 when the difference is not NaN, 0 otherwise.
//   2. The flags are prefix-summed with + in __local memory (mem_38246).
//   3. The scan value at index i_31490 is read; work-item 0 later stores it
//      in mem_38258 at the group's slot.
//   4. Two further local arrays are filled with NaN (mem_38252) and 0
//      (mem_38255). Each non-NaN difference is then written, together with
//      its ltid index, at position (scan value - 1).
//   5. Both local arrays are copied to mem_38262 (floats) and mem_38266
//      (int32) at float/int offset group_id * sizze_31214 + local id.
//
// Parameters:
//   mem_38246_backing_aligned_0  __local buffer for the flags and their scan.
//   mem_38249_backing_aligned_1  __local buffer for the differences.
//   mem_38252_backing_aligned_2  __local buffer for the compacted differences.
//   mem_38255_backing_aligned_3  __local buffer for the compacted indices.
//   sizze_31214                  inner extent; also the number of local
//                                slots used per group.
//   sizze_31215                  bound on the outer index.
//   sizze_31216                  row stride used when indexing
//                                images_mem_37894.
//   i_31490                      index of the scan element that is exported.
//   images_mem_37894, res_mem_38199  input buffers, read as float.
//   mem_38258, mem_38262, mem_38266  output buffers.
//
// NOTE(review): slt32, sle32, squot32 and futrts_isnan32 are defined outside
// this kernel; they are presumably signed <, signed <=, signed division and
// a float NaN test -- confirm.
// NOTE(review): local slots are indexed by local id while the loads use ids
// derived from the global id; this presumably assumes a work-group size of
// sizze_31214 -- confirm against the host-side launch code.
__kernel void map_intra_group_34818(__local volatile
                                    int64_t *mem_38246_backing_aligned_0,
                                    __local volatile
                                    int64_t *mem_38249_backing_aligned_1,
                                    __local volatile
                                    int64_t *mem_38252_backing_aligned_2,
                                    __local volatile
                                    int64_t *mem_38255_backing_aligned_3,
                                    int32_t sizze_31214, int32_t sizze_31215,
                                    int32_t sizze_31216, int32_t i_31490,
                                    __global unsigned char *images_mem_37894,
                                    __global unsigned char *res_mem_38199,
                                    __global unsigned char *mem_38258, __global
                                    unsigned char *mem_38262, __global
                                    unsigned char *mem_38266)
{
    // block_dim0..2 are not referenced anywhere else in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    __local volatile char *restrict mem_38246_backing_0 =
                          mem_38246_backing_aligned_0;
    __local volatile char *restrict mem_38249_backing_1 =
                          mem_38249_backing_aligned_1;
    __local volatile char *restrict mem_38252_backing_2 =
                          mem_38252_backing_aligned_2;
    __local volatile char *restrict mem_38255_backing_3 =
                          mem_38255_backing_aligned_3;
    int32_t global_tid_34818;
    int32_t local_tid_34819;
    int32_t group_sizze_38910;
    int32_t wave_sizze_38909;
    int32_t group_id_34820;
    
    global_tid_34818 = get_global_id(0);
    local_tid_34819 = get_local_id(0);
    // group_sizze_38910 is assigned here but never read in this kernel.
    group_sizze_38910 = get_local_size(0);
    wave_sizze_38909 = LOCKSTEP_WIDTH;
    group_id_34820 = get_group_id(0);
    
    int32_t gtid_34808;
    int32_t ltid_34809;
    
    // Split the flat global id into an outer index and an inner index with
    // extent sizze_31214.
    gtid_34808 = squot32(global_tid_34818, sizze_31214);
    ltid_34809 = global_tid_34818 - squot32(global_tid_34818, sizze_31214) *
        sizze_31214;
    
    float x_37027;
    float x_37029;
    bool res_34992;
    bool cond_34993;
    float res_34994;
    bool res_34996;
    bool res_34997;
    int32_t res_34998;
    
    // Step 1: difference and not-NaN flag. Out-of-bounds work-items leave
    // res_34994 and res_34998 unassigned.
    if (slt32(gtid_34808, sizze_31215) && slt32(ltid_34809, sizze_31214)) {
        // The image buffer is indexed with row stride sizze_31216, the other
        // input with row stride sizze_31214.
        x_37027 = *(__global float *) &images_mem_37894[(gtid_34808 *
                                                         sizze_31216 +
                                                         ltid_34809) * 4];
        x_37029 = *(__global float *) &res_mem_38199[(gtid_34808 * sizze_31214 +
                                                      ltid_34809) * 4];
        res_34992 = futrts_isnan32(x_37027);
        cond_34993 = !res_34992;
        if (cond_34993) {
            float res_34995 = x_37027 - x_37029;
            
            res_34994 = res_34995;
        } else {
            res_34994 = NAN;
        }
        res_34996 = futrts_isnan32(res_34994);
        res_34997 = !res_34996;
        if (res_34997) {
            res_34998 = 1;
        } else {
            res_34998 = 0;
        }
    }
    
    __local char *mem_38246;
    __local char *mem_38249;
    
    mem_38246 = (__local char *) mem_38246_backing_0;
    mem_38249 = (__local char *) mem_38249_backing_1;
    // Single-iteration loop: publish flag and difference at the work-item's
    // own slot of the two local arrays.
    for (int32_t comb_iter_38911 = 0; comb_iter_38911 < 1; comb_iter_38911++) {
        int32_t ctid_34811;
        int32_t flat_comb_id_38912 = comb_iter_38911 * sizze_31214 +
                local_tid_34819;
        
        ctid_34811 = flat_comb_id_38912;
        if (slt32(ctid_34811, sizze_31214) && 1) {
            *(__local int32_t *) &mem_38246[ctid_34811 * 4] = res_34998;
            *(__local float *) &mem_38249[ctid_34811 * 4] = res_34994;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    // Step 2: prefix sum of the flags in mem_38246, organised in blocks of 32
    // consecutive work-items.
    // NOTE(review): the block width is the literal 32, while the barriers in
    // the scan loops depend on wave_sizze_38909 (LOCKSTEP_WIDTH).
    int32_t x_35000;
    int32_t x_35001;
    int32_t x_38913;
    int32_t x_38914;
    int32_t skip_threads_38916;
    
    if (slt32(local_tid_34819, sizze_31214)) {
        x_35001 = *(volatile __local int32_t *) &mem_38246[local_tid_34819 *
                                                           sizeof(int32_t)];
    }
    // in-block scan (hopefully no barriers needed)
    {
        skip_threads_38916 = 1;
        while (slt32(skip_threads_38916, 32)) {
            // A work-item combines with the slot skip_threads positions to
            // its left when its position inside the block is at least
            // skip_threads.
            if (sle32(skip_threads_38916, local_tid_34819 -
                      squot32(local_tid_34819, 32) * 32) &&
                slt32(local_tid_34819, sizze_31214)) {
                // read operands
                {
                    x_35000 = *(volatile __local
                                int32_t *) &mem_38246[(local_tid_34819 -
                                                       skip_threads_38916) *
                                                      sizeof(int32_t)];
                }
                // perform operation
                {
                    int32_t res_35002;
                    
                    // NOTE(review): when this guard is false res_35002 stays
                    // unassigned and is still copied into x_35001 below.
                    if (slt32(gtid_34808, sizze_31215) && slt32(ltid_34809,
                                                                sizze_31214)) {
                        res_35002 = x_35000 + x_35001;
                    }
                    x_35001 = res_35002;
                }
            }
            // A barrier separates reads from writes only once the stride has
            // reached the lockstep width.
            if (sle32(wave_sizze_38909, skip_threads_38916)) {
                barrier(CLK_LOCAL_MEM_FENCE);
            }
            if (sle32(skip_threads_38916, local_tid_34819 -
                      squot32(local_tid_34819, 32) * 32) &&
                slt32(local_tid_34819, sizze_31214)) {
                // write result
                {
                    *(volatile __local int32_t *) &mem_38246[local_tid_34819 *
                                                             sizeof(int32_t)] =
                        x_35001;
                }
            }
            if (sle32(wave_sizze_38909, skip_threads_38916)) {
                barrier(CLK_LOCAL_MEM_FENCE);
            }
            skip_threads_38916 *= 2;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // last thread of block 'i' writes its result to offset 'i'
    // (this overwrites slots of the first block; they are rewritten from
    // x_35001 in the restore step further down)
    {
        if ((local_tid_34819 - squot32(local_tid_34819, 32) * 32) == 31 &&
            slt32(local_tid_34819, sizze_31214)) {
            *(volatile __local int32_t *) &mem_38246[squot32(local_tid_34819,
                                                             32) *
                                                     sizeof(int32_t)] = x_35001;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
    {
        int32_t skip_threads_38917;
        
        // Only work-items of the first block take part; they use the separate
        // registers x_38913 and x_38914 so that x_35001 is preserved.
        if (squot32(local_tid_34819, 32) == 0 && slt32(local_tid_34819,
                                                       sizze_31214)) {
            x_38914 = *(volatile __local int32_t *) &mem_38246[local_tid_34819 *
                                                               sizeof(int32_t)];
        }
        // in-block scan (hopefully no barriers needed)
        {
            skip_threads_38917 = 1;
            while (slt32(skip_threads_38917, 32)) {
                if (sle32(skip_threads_38917, local_tid_34819 -
                          squot32(local_tid_34819, 32) * 32) &&
                    (squot32(local_tid_34819, 32) == 0 && slt32(local_tid_34819,
                                                                sizze_31214))) {
                    // read operands
                    {
                        x_38913 = *(volatile __local
                                    int32_t *) &mem_38246[(local_tid_34819 -
                                                           skip_threads_38917) *
                                                          sizeof(int32_t)];
                    }
                    // perform operation
                    {
                        int32_t res_38915;
                        
                        if (slt32(gtid_34808, sizze_31215) && slt32(ltid_34809,
                                                                    sizze_31214)) {
                            res_38915 = x_38913 + x_38914;
                        }
                        x_38914 = res_38915;
                    }
                }
                if (sle32(wave_sizze_38909, skip_threads_38917)) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
                if (sle32(skip_threads_38917, local_tid_34819 -
                          squot32(local_tid_34819, 32) * 32) &&
                    (squot32(local_tid_34819, 32) == 0 && slt32(local_tid_34819,
                                                                sizze_31214))) {
                    // write result
                    {
                        *(volatile __local
                          int32_t *) &mem_38246[local_tid_34819 *
                                                sizeof(int32_t)] = x_38914;
                    }
                }
                if (sle32(wave_sizze_38909, skip_threads_38917)) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
                skip_threads_38917 *= 2;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // carry-in for every block except the first
    {
        if (!(squot32(local_tid_34819, 32) == 0 || !slt32(local_tid_34819,
                                                          sizze_31214))) {
            // read operands
            {
                x_35000 = *(volatile __local
                            int32_t *) &mem_38246[(squot32(local_tid_34819,
                                                           32) - 1) *
                                                  sizeof(int32_t)];
            }
            // perform operation
            {
                int32_t res_35002;
                
                if (slt32(gtid_34808, sizze_31215) && slt32(ltid_34809,
                                                            sizze_31214)) {
                    res_35002 = x_35000 + x_35001;
                }
                x_35001 = res_35002;
            }
            // write final result
            {
                *(volatile __local int32_t *) &mem_38246[local_tid_34819 *
                                                         sizeof(int32_t)] =
                    x_35001;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // restore correct values for first block
    // NOTE(review): unlike the other scan writes, this one is not guarded by
    // local_tid_34819 < sizze_31214.
    {
        if (squot32(local_tid_34819, 32) == 0) {
            *(volatile __local int32_t *) &mem_38246[local_tid_34819 *
                                                     sizeof(int32_t)] = x_35001;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t res_35003;
    
    // Step 3: read the scan value at index i_31490.
    if (slt32(gtid_34808, sizze_31215) && slt32(ltid_34809, sizze_31214)) {
        res_35003 = *(__local int32_t *) &mem_38246[i_31490 * 4];
    }
    
    __local char *mem_38252;
    __local char *mem_38255;
    
    // Step 4a: prefill the compacted-value array with NaN ...
    mem_38252 = (__local char *) mem_38252_backing_2;
    for (int32_t comb_iter_38918 = 0; comb_iter_38918 < 1; comb_iter_38918++) {
        int32_t new_local_index_34812;
        int32_t flat_comb_id_38919 = comb_iter_38918 * sizze_31214 +
                local_tid_34819;
        
        new_local_index_34812 = flat_comb_id_38919;
        if (slt32(new_local_index_34812, sizze_31214) && 1) {
            *(__local float *) &mem_38252[new_local_index_34812 * 4] = NAN;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // ... and the compacted-index array with 0.
    mem_38255 = (__local char *) mem_38255_backing_3;
    for (int32_t comb_iter_38920 = 0; comb_iter_38920 < 1; comb_iter_38920++) {
        int32_t new_local_index_34814;
        int32_t flat_comb_id_38921 = comb_iter_38920 * sizze_31214 +
                local_tid_34819;
        
        new_local_index_34814 = flat_comb_id_38921;
        if (slt32(new_local_index_34814, sizze_31214) && 1) {
            *(__local int32_t *) &mem_38255[new_local_index_34814 * 4] = 0;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    float x_37032;
    int32_t x_37034;
    bool res_35013;
    bool res_35014;
    int32_t res_35015;
    
    // Step 4b: destination slot of this element. A non-NaN difference goes to
    // (scan value - 1); a NaN difference gets -1, which the range check in the
    // scatter below rejects.
    if (slt32(gtid_34808, sizze_31215) && slt32(ltid_34809, sizze_31214)) {
        x_37032 = *(__local float *) &mem_38249[ltid_34809 * 4];
        x_37034 = *(__local int32_t *) &mem_38246[ltid_34809 * 4];
        res_35013 = futrts_isnan32(x_37032);
        res_35014 = !res_35013;
        if (res_35014) {
            int32_t res_35016 = x_37034 - 1;
            
            res_35015 = res_35016;
        } else {
            res_35015 = -1;
        }
    }
    // Step 4c: scatter the index and the value to the destination slot when
    // it lies in [0, sizze_31214).
    for (int32_t comb_iter_38922 = 0; comb_iter_38922 < 1; comb_iter_38922++) {
        int32_t ctid_34816;
        int32_t flat_comb_id_38923 = comb_iter_38922 * sizze_31214 +
                local_tid_34819;
        
        ctid_34816 = flat_comb_id_38923;
        if (slt32(ctid_34816, sizze_31214) && 1) {
            if (sle32(0, res_35015) && slt32(res_35015, sizze_31214)) {
                *(__local int32_t *) &mem_38255[res_35015 * 4] = ltid_34809;
            }
            if (sle32(0, res_35015) && slt32(res_35015, sizze_31214)) {
                *(__local float *) &mem_38252[res_35015 * 4] = x_37032;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Work-item 0 exports the scan value read in step 3.
    if (local_tid_34819 == 0) {
        *(__global int32_t *) &mem_38258[group_id_34820 * 4] = res_35003;
    }
    // Step 5: copy the compacted values to this group's region of mem_38262.
    for (int32_t i_38925 = 0; i_38925 < squot32(sizze_31214 - local_tid_34819 +
                                                sizze_31214 - 1, sizze_31214);
         i_38925++) {
        *(__global float *) &mem_38262[(group_id_34820 * sizze_31214 +
                                        (i_38925 * sizze_31214 +
                                         local_tid_34819)) * 4] = *(__local
                                                                    float *) &mem_38252[(i_38925 *
                                                                                         sizze_31214 +
                                                                                         local_tid_34819) *
                                                                                        4];
    }
    // Same copy for the compacted indices, into mem_38266.
    for (int32_t i_38926 = 0; i_38926 < squot32(sizze_31214 - local_tid_34819 +
                                                sizze_31214 - 1, sizze_31214);
         i_38926++) {
        *(__global int32_t *) &mem_38266[(group_id_34820 * sizze_31214 +
                                          (i_38926 * sizze_31214 +
                                           local_tid_34819)) * 4] = *(__local
                                                                      int32_t *) &mem_38255[(i_38926 *
                                                                                             sizze_31214 +
                                                                                             local_tid_34819) *
                                                                                            4];
    }
}
// Kernel map_intra_group_35256.
//
// Each work-item derives two indices from its global id:
//   gtid_35249 = global id divided by n_31219
//   ltid_35250 = global id modulo n_31219
// and the work-group then performs two sum reductions in local memory:
//   1. res_35400: the number of work-items whose float read from
//      images_mem_37894[gtid * sizze_31216 + ltid] is not flagged by
//      futrts_isnan32.
//   2. res_35410: the sum of squares of the floats read from
//      res_mem_38290[gtid * sizze_31214 + ltid], using 0.0 for every
//      ltid that is not below res_35400.
// From these, work-item 0 of the group writes three per-group results,
// indexed by the group id:
//   mem_38317 <- fptosi_f32_i32(hfrac_31221 * float(res_35400))
//   mem_38320 <- res_35400
//   mem_38323 <- futrts_sqrt32(res_35410 / float(res_35400 - res_31235))
//
// Parameters:
//   mem_38311_backing_aligned_0  local scratch for the integer reduction;
//                                indexed as 4-byte elements up to n_31219.
//   mem_38314_backing_aligned_1  local scratch for the float reduction;
//                                indexed as 4-byte elements up to n_31219.
//   sizze_31214   row stride (in elements) used to index res_mem_38290.
//   sizze_31215   upper bound on gtid_35249 for a work-item to be active.
//   sizze_31216   row stride (in elements) used to index images_mem_37894.
//   n_31219       number of elements per reduction and divisor for the
//                 global id.
//   hfrac_31221   float factor multiplied with the count.
//   res_31235     integer subtracted from the count before the division.
//   images_mem_37894, res_mem_38290   input buffers, read as float.
//   mem_38317, mem_38320, mem_38323   output buffers (int32, int32, float).
//
// NOTE(review): slt32, squot32, sitofp_i32_f32, fptosi_f32_i32,
// futrts_isnan32, futrts_sqrt32 and LOCKSTEP_WIDTH are defined outside this
// chunk; presumably signed compare / truncating division / conversions /
// math wrappers and the hardware wave width -- confirm against their
// definitions.
// NOTE(review): local memory is indexed by the local id while global memory
// is indexed by ltid_35250; this presumably assumes the kernel is launched
// with a work-group size of n_31219 -- confirm at the launch site.
__kernel void map_intra_group_35256(__local volatile
                                    int64_t *mem_38311_backing_aligned_0,
                                    __local volatile
                                    int64_t *mem_38314_backing_aligned_1,
                                    int32_t sizze_31214, int32_t sizze_31215,
                                    int32_t sizze_31216, int32_t n_31219,
                                    float hfrac_31221, int32_t res_31235,
                                    __global unsigned char *images_mem_37894,
                                    __global unsigned char *res_mem_38290,
                                    __global unsigned char *mem_38317, __global
                                    unsigned char *mem_38320, __global
                                    unsigned char *mem_38323)
{
    // block_dim0..2 are not referenced anywhere else in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    // Byte-addressed views of the two local scratch buffers.
    __local volatile char *restrict mem_38311_backing_0 =
                          mem_38311_backing_aligned_0;
    __local volatile char *restrict mem_38314_backing_1 =
                          mem_38314_backing_aligned_1;
    int32_t global_tid_35256;
    int32_t local_tid_35257;
    int32_t group_sizze_38990;
    int32_t wave_sizze_38989;
    int32_t group_id_35258;
    
    // group_sizze_38990 is assigned here but never read in this kernel.
    global_tid_35256 = get_global_id(0);
    local_tid_35257 = get_local_id(0);
    group_sizze_38990 = get_local_size(0);
    wave_sizze_38989 = LOCKSTEP_WIDTH;
    group_id_35258 = get_group_id(0);
    
    int32_t gtid_35249;
    int32_t ltid_35250;
    
    // Split the flat global id into a quotient and a remainder by n_31219.
    gtid_35249 = squot32(global_tid_35256, n_31219);
    ltid_35250 = global_tid_35256 - squot32(global_tid_35256, n_31219) *
        n_31219;
    
    float x_37062;
    bool res_35397;
    bool cond_35398;
    int32_t res_35399;
    
    // Step 1: per work-item flag, 1 when the input element is not NaN and
    // 0 otherwise. Only computed by in-bounds work-items.
    if (slt32(gtid_35249, sizze_31215) && slt32(ltid_35250, n_31219)) {
        x_37062 = *(__global float *) &images_mem_37894[(gtid_35249 *
                                                         sizze_31216 +
                                                         ltid_35250) * 4];
        res_35397 = futrts_isnan32(x_37062);
        cond_35398 = !res_35397;
        if (cond_35398) {
            res_35399 = 1;
        } else {
            res_35399 = 0;
        }
    }
    
    __local char *mem_38311;
    int32_t res_35400;
    
    // Step 2: publish the flags to local memory. The loop body runs exactly
    // once (the bound is the literal 1), so each work-item stores at most
    // one element, at its own local id.
    // NOTE(review): res_35399 is only assigned under the guard above, but is
    // stored here whenever the local id is below n_31219.
    mem_38311 = (__local char *) mem_38311_backing_0;
    for (int32_t comb_iter_38991 = 0; comb_iter_38991 < 1; comb_iter_38991++) {
        int32_t ctid_35252;
        int32_t flat_comb_id_38992 = comb_iter_38991 * n_31219 +
                local_tid_35257;
        
        ctid_35252 = flat_comb_id_38992;
        if (slt32(ctid_35252, n_31219) && 1) {
            *(__local int32_t *) &mem_38311[ctid_35252 * 4] = res_35399;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    // Step 3: sum the n_31219 flags. x_35401 is this work-item's running
    // accumulator and x_35402 the partner element read from local memory.
    int32_t offset_38993;
    int32_t skip_waves_38994;
    int32_t x_35401;
    int32_t x_35402;
    
    offset_38993 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_35257, n_31219)) {
            x_35401 = *(__local int32_t *) &mem_38311[(local_tid_35257 +
                                                       offset_38993) * 4];
        }
    }
    // Phase A: offsets 1, 2, 4, ... below the wave size. A work-item takes
    // part when its position inside its wave has none of the low bits
    // (2 * offset - 1) set and its partner index is below n_31219. There is
    // no barrier inside this loop; accesses go through volatile pointers.
    // NOTE(review): presumably relies on the work-items of one wave running
    // in lockstep -- confirm that LOCKSTEP_WIDTH matches the device.
    offset_38993 = 1;
    while (slt32(offset_38993, wave_sizze_38989)) {
        if (slt32(local_tid_35257 + offset_38993, n_31219) &&
            ((local_tid_35257 - squot32(local_tid_35257, wave_sizze_38989) *
              wave_sizze_38989) & (2 * offset_38993 - 1)) == 0) {
            // read array element
            {
                x_35402 = *(volatile __local
                            int32_t *) &mem_38311[(local_tid_35257 +
                                                   offset_38993) * 4];
            }
            // apply reduction operation
            // NOTE(review): res_35403 stays unassigned when the guard is
            // false, yet it is copied into the accumulator either way.
            {
                int32_t res_35403;
                
                if (slt32(gtid_35249, sizze_31215) && slt32(ltid_35250,
                                                            n_31219)) {
                    res_35403 = x_35401 + x_35402;
                }
                x_35401 = res_35403;
            }
            // write result of operation
            {
                *(volatile __local int32_t *) &mem_38311[local_tid_35257 * 4] =
                    x_35401;
            }
        }
        offset_38993 *= 2;
    }
    // Phase B: combine the per-wave partial sums. skip_waves doubles each
    // round up to ceil(n_31219 / wave size); every round starts with a
    // barrier. Only the first work-item of a wave takes part, and only when
    // its wave index has none of the low bits (2 * skip_waves - 1) set.
    skip_waves_38994 = 1;
    while (slt32(skip_waves_38994, squot32(n_31219 + wave_sizze_38989 - 1,
                                           wave_sizze_38989))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_38993 = skip_waves_38994 * wave_sizze_38989;
        if (slt32(local_tid_35257 + offset_38993, n_31219) &&
            ((local_tid_35257 - squot32(local_tid_35257, wave_sizze_38989) *
              wave_sizze_38989) == 0 && (squot32(local_tid_35257,
                                                 wave_sizze_38989) & (2 *
                                                                      skip_waves_38994 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_35402 = *(__local int32_t *) &mem_38311[(local_tid_35257 +
                                                           offset_38993) * 4];
            }
            // apply reduction operation
            {
                int32_t res_35403;
                
                if (slt32(gtid_35249, sizze_31215) && slt32(ltid_35250,
                                                            n_31219)) {
                    res_35403 = x_35401 + x_35402;
                }
                x_35401 = res_35403;
            }
            // write result of operation
            {
                *(__local int32_t *) &mem_38311[local_tid_35257 * 4] = x_35401;
            }
        }
        skip_waves_38994 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Every work-item reads the reduced count from element 0.
    res_35400 = *(__local int32_t *) &mem_38311[0];
    
    bool cond_35406;
    float x_37067;
    float res_35409;
    
    // Step 4: per work-item square of the element from res_mem_38290, with
    // 0.0 substituted for every ltid that is not below the count res_35400.
    if (slt32(gtid_35249, sizze_31215) && slt32(ltid_35250, n_31219)) {
        cond_35406 = slt32(ltid_35250, res_35400);
        if (cond_35406) {
            float res_35408 = *(__global float *) &res_mem_38290[(gtid_35249 *
                                                                  sizze_31214 +
                                                                  ltid_35250) *
                                                                 4];
            
            x_37067 = res_35408;
        } else {
            x_37067 = 0.0F;
        }
        res_35409 = x_37067 * x_37067;
    }
    
    __local char *mem_38314;
    float res_35410;
    
    // Step 5: publish the squares to the second local buffer (single-trip
    // loop, one element per work-item, same shape as step 2).
    mem_38314 = (__local char *) mem_38314_backing_1;
    for (int32_t comb_iter_38995 = 0; comb_iter_38995 < 1; comb_iter_38995++) {
        int32_t ctid_35254;
        int32_t flat_comb_id_38996 = comb_iter_38995 * n_31219 +
                local_tid_35257;
        
        ctid_35254 = flat_comb_id_38996;
        if (slt32(ctid_35254, n_31219) && 1) {
            *(__local float *) &mem_38314[ctid_35254 * 4] = res_35409;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    // Step 6: float sum reduction over mem_38314, structured exactly like
    // the integer reduction in step 3 (barrier-free intra-wave phase, then
    // a barrier-separated cross-wave phase).
    int32_t offset_38997;
    int32_t skip_waves_38998;
    float x_35411;
    float x_35412;
    
    offset_38997 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_35257, n_31219)) {
            x_35411 = *(__local float *) &mem_38314[(local_tid_35257 +
                                                     offset_38997) * 4];
        }
    }
    offset_38997 = 1;
    while (slt32(offset_38997, wave_sizze_38989)) {
        if (slt32(local_tid_35257 + offset_38997, n_31219) &&
            ((local_tid_35257 - squot32(local_tid_35257, wave_sizze_38989) *
              wave_sizze_38989) & (2 * offset_38997 - 1)) == 0) {
            // read array element
            {
                x_35412 = *(volatile __local
                            float *) &mem_38314[(local_tid_35257 +
                                                 offset_38997) * 4];
            }
            // apply reduction operation
            // NOTE(review): as above, res_35413 is unassigned when the guard
            // is false but is still copied into the accumulator.
            {
                float res_35413;
                
                if (slt32(gtid_35249, sizze_31215) && slt32(ltid_35250,
                                                            n_31219)) {
                    res_35413 = x_35411 + x_35412;
                }
                x_35411 = res_35413;
            }
            // write result of operation
            {
                *(volatile __local float *) &mem_38314[local_tid_35257 * 4] =
                    x_35411;
            }
        }
        offset_38997 *= 2;
    }
    skip_waves_38998 = 1;
    while (slt32(skip_waves_38998, squot32(n_31219 + wave_sizze_38989 - 1,
                                           wave_sizze_38989))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_38997 = skip_waves_38998 * wave_sizze_38989;
        if (slt32(local_tid_35257 + offset_38997, n_31219) &&
            ((local_tid_35257 - squot32(local_tid_35257, wave_sizze_38989) *
              wave_sizze_38989) == 0 && (squot32(local_tid_35257,
                                                 wave_sizze_38989) & (2 *
                                                                      skip_waves_38998 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_35412 = *(__local float *) &mem_38314[(local_tid_35257 +
                                                         offset_38997) * 4];
            }
            // apply reduction operation
            {
                float res_35413;
                
                if (slt32(gtid_35249, sizze_31215) && slt32(ltid_35250,
                                                            n_31219)) {
                    res_35413 = x_35411 + x_35412;
                }
                x_35411 = res_35413;
            }
            // write result of operation
            {
                *(__local float *) &mem_38314[local_tid_35257 * 4] = x_35411;
            }
        }
        skip_waves_38998 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Every work-item reads the reduced sum of squares from element 0.
    res_35410 = *(__local float *) &mem_38314[0];
    
    int32_t arg_35414;
    float res_35415;
    float arg_35416;
    float res_35417;
    float res_35418;
    float arg_35419;
    int32_t res_35420;
    
    // Step 7: derive the two remaining per-group results from the count and
    // the sum of squares.
    //   res_35417 = sqrt(sum_of_squares / float(count - res_31235))
    //   res_35420 = int(hfrac_31221 * float(count))
    // NOTE(review): nothing here guards against count - res_31235 being zero
    // or negative; the float division and square root are applied as-is.
    if (slt32(gtid_35249, sizze_31215) && slt32(ltid_35250, n_31219)) {
        arg_35414 = res_35400 - res_31235;
        res_35415 = sitofp_i32_f32(arg_35414);
        arg_35416 = res_35410 / res_35415;
        res_35417 = futrts_sqrt32(arg_35416);
        res_35418 = sitofp_i32_f32(res_35400);
        arg_35419 = hfrac_31221 * res_35418;
        res_35420 = fptosi_f32_i32(arg_35419);
    }
    // Step 8: the first work-item of the group writes the three results at
    // the group's index.
    // NOTE(review): res_35420 and res_35417 are only assigned under the
    // guard above, while these stores depend only on the local id being 0.
    if (local_tid_35257 == 0) {
        *(__global int32_t *) &mem_38317[group_id_35258 * 4] = res_35420;
    }
    if (local_tid_35257 == 0) {
        *(__global int32_t *) &mem_38320[group_id_35258 * 4] = res_35400;
    }
    if (local_tid_35257 == 0) {
        *(__global float *) &mem_38323[group_id_35258 * 4] = res_35417;
    }
}
// Kernel map_intra_group_35604.
//
// Each work-item derives two indices from its global id:
//   gtid_35599 = global id divided by res_31594
//   ltid_35600 = global id modulo res_31594
// An in-bounds work-item loads two per-gtid integers,
//   x_35685 = res_mem_38340[gtid]   and   x_35686 = res_mem_38339[gtid],
// and contributes
//   res_mem_38290[gtid * sizze_31214 + (ltid + x_35685 - x_35686 + 1)]
// when ltid is below x_35686, or 0.0 otherwise. The contributions are summed
// across the work-group in local memory and the first work-item of the
// group writes the sum to mem_38353 at the group's index.
//
// Parameters:
//   mem_38350_backing_aligned_0  local scratch for the float reduction;
//                                indexed as 4-byte elements up to res_31594.
//   sizze_31214   row stride (in elements) used to index res_mem_38290.
//   sizze_31215   upper bound on gtid_35599 for a work-item to be active.
//   res_31594     number of elements per reduction and divisor for the
//                 global id.
//   res_mem_38290 input buffer, read as float.
//   res_mem_38339, res_mem_38340   input buffers, read as int32 per gtid.
//   mem_38353     output buffer, one float per work-group.
//
// NOTE(review): slt32, squot32 and LOCKSTEP_WIDTH are defined outside this
// chunk; presumably signed less-than, truncating signed division and the
// hardware wave width -- confirm against their definitions.
// NOTE(review): local memory is indexed by the local id while global memory
// is indexed by ltid_35600; this presumably assumes a work-group size of
// res_31594 -- confirm at the launch site.
__kernel void map_intra_group_35604(__local volatile
                                    int64_t *mem_38350_backing_aligned_0,
                                    int32_t sizze_31214, int32_t sizze_31215,
                                    int32_t res_31594, __global
                                    unsigned char *res_mem_38290, __global
                                    unsigned char *res_mem_38339, __global
                                    unsigned char *res_mem_38340, __global
                                    unsigned char *mem_38353)
{
    // block_dim0..2 are not referenced anywhere else in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    // Byte-addressed view of the local scratch buffer.
    __local volatile char *restrict mem_38350_backing_0 =
                          mem_38350_backing_aligned_0;
    int32_t global_tid_35604;
    int32_t local_tid_35605;
    int32_t group_sizze_39100;
    int32_t wave_sizze_39099;
    int32_t group_id_35606;
    
    // group_sizze_39100 is assigned here but never read in this kernel.
    global_tid_35604 = get_global_id(0);
    local_tid_35605 = get_local_id(0);
    group_sizze_39100 = get_local_size(0);
    wave_sizze_39099 = LOCKSTEP_WIDTH;
    group_id_35606 = get_group_id(0);
    
    int32_t gtid_35599;
    int32_t ltid_35600;
    
    // Split the flat global id into a quotient and a remainder by res_31594.
    gtid_35599 = squot32(global_tid_35604, res_31594);
    ltid_35600 = global_tid_35604 - squot32(global_tid_35604, res_31594) *
        res_31594;
    
    int32_t x_35685;
    int32_t x_35686;
    bool cond_35689;
    float x_37087;
    
    // Step 1: compute this work-item's contribution x_37087.
    if (slt32(gtid_35599, sizze_31215) && slt32(ltid_35600, res_31594)) {
        x_35685 = *(__global int32_t *) &res_mem_38340[gtid_35599 * 4];
        x_35686 = *(__global int32_t *) &res_mem_38339[gtid_35599 * 4];
        cond_35689 = slt32(ltid_35600, x_35686);
        if (cond_35689) {
            int32_t x_35691;
            int32_t x_35692;
            int32_t i_35693;
            float res_35694;
            
            // Column index: ltid + x_35685 - x_35686 + 1.
            x_35691 = ltid_35600 + x_35685;
            x_35692 = x_35691 - x_35686;
            i_35693 = 1 + x_35692;
            res_35694 = *(__global float *) &res_mem_38290[(gtid_35599 *
                                                            sizze_31214 +
                                                            i_35693) * 4];
            x_37087 = res_35694;
        } else {
            x_37087 = 0.0F;
        }
    }
    
    __local char *mem_38350;
    float res_35695;
    
    // Step 2: publish the contributions to local memory. The loop body runs
    // exactly once (the bound is the literal 1), so each work-item stores at
    // most one element, at its own local id.
    // NOTE(review): x_37087 is only assigned under the guard above, but is
    // stored here whenever the local id is below res_31594.
    mem_38350 = (__local char *) mem_38350_backing_0;
    for (int32_t comb_iter_39101 = 0; comb_iter_39101 < 1; comb_iter_39101++) {
        int32_t ctid_35602;
        int32_t flat_comb_id_39102 = comb_iter_39101 * res_31594 +
                local_tid_35605;
        
        ctid_35602 = flat_comb_id_39102;
        if (slt32(ctid_35602, res_31594) && 1) {
            *(__local float *) &mem_38350[ctid_35602 * 4] = x_37087;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    // Step 3: sum the res_31594 contributions. x_35696 is this work-item's
    // running accumulator and x_35697 the partner element read from local
    // memory.
    int32_t offset_39103;
    int32_t skip_waves_39104;
    float x_35696;
    float x_35697;
    
    offset_39103 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_35605, res_31594)) {
            x_35696 = *(__local float *) &mem_38350[(local_tid_35605 +
                                                     offset_39103) * 4];
        }
    }
    // Phase A: offsets 1, 2, 4, ... below the wave size. A work-item takes
    // part when its position inside its wave has none of the low bits
    // (2 * offset - 1) set and its partner index is below res_31594. There
    // is no barrier inside this loop; accesses go through volatile pointers.
    // NOTE(review): presumably relies on the work-items of one wave running
    // in lockstep -- confirm that LOCKSTEP_WIDTH matches the device.
    offset_39103 = 1;
    while (slt32(offset_39103, wave_sizze_39099)) {
        if (slt32(local_tid_35605 + offset_39103, res_31594) &&
            ((local_tid_35605 - squot32(local_tid_35605, wave_sizze_39099) *
              wave_sizze_39099) & (2 * offset_39103 - 1)) == 0) {
            // read array element
            {
                x_35697 = *(volatile __local
                            float *) &mem_38350[(local_tid_35605 +
                                                 offset_39103) * 4];
            }
            // apply reduction operation
            // NOTE(review): res_35698 stays unassigned when the guard is
            // false, yet it is copied into the accumulator either way.
            {
                float res_35698;
                
                if (slt32(gtid_35599, sizze_31215) && slt32(ltid_35600,
                                                            res_31594)) {
                    res_35698 = x_35696 + x_35697;
                }
                x_35696 = res_35698;
            }
            // write result of operation
            {
                *(volatile __local float *) &mem_38350[local_tid_35605 * 4] =
                    x_35696;
            }
        }
        offset_39103 *= 2;
    }
    // Phase B: combine the per-wave partial sums. skip_waves doubles each
    // round up to ceil(res_31594 / wave size); every round starts with a
    // barrier. Only the first work-item of a wave takes part, and only when
    // its wave index has none of the low bits (2 * skip_waves - 1) set.
    skip_waves_39104 = 1;
    while (slt32(skip_waves_39104, squot32(res_31594 + wave_sizze_39099 - 1,
                                           wave_sizze_39099))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_39103 = skip_waves_39104 * wave_sizze_39099;
        if (slt32(local_tid_35605 + offset_39103, res_31594) &&
            ((local_tid_35605 - squot32(local_tid_35605, wave_sizze_39099) *
              wave_sizze_39099) == 0 && (squot32(local_tid_35605,
                                                 wave_sizze_39099) & (2 *
                                                                      skip_waves_39104 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_35697 = *(__local float *) &mem_38350[(local_tid_35605 +
                                                         offset_39103) * 4];
            }
            // apply reduction operation
            {
                float res_35698;
                
                if (slt32(gtid_35599, sizze_31215) && slt32(ltid_35600,
                                                            res_31594)) {
                    res_35698 = x_35696 + x_35697;
                }
                x_35696 = res_35698;
            }
            // write result of operation
            {
                *(__local float *) &mem_38350[local_tid_35605 * 4] = x_35696;
            }
        }
        skip_waves_39104 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Every work-item reads the reduced sum from element 0; only the first
    // work-item of the group stores it to global memory.
    res_35695 = *(__local float *) &mem_38350[0];
    if (local_tid_35605 == 0) {
        *(__global float *) &mem_38353[group_id_35606 * 4] = res_35695;
    }
}
__kernel void map_intra_group_35851(__local volatile
                                    int64_t *mem_38403_backing_aligned_0,
                                    __local volatile
                                    int64_t *mem_38405_backing_aligned_1,
                                    __local volatile
                                    int64_t *mem_38408_backing_aligned_2,
                                    __local volatile
                                    int64_t *mem_38411_backing_aligned_3,
                                    __local volatile
                                    int64_t *mem_38414_backing_aligned_4,
                                    __local volatile
                                    int64_t *mem_38417_backing_aligned_5,
                                    int32_t sizze_31214, int32_t sizze_31215,
                                    int32_t n_31219, int32_t arg_31616, __global
                                    unsigned char *res_mem_38289, __global
                                    unsigned char *res_mem_38290, __global
                                    unsigned char *res_mem_38291, __global
                                    unsigned char *res_mem_38339, __global
                                    unsigned char *res_mem_38340, __global
                                    unsigned char *res_mem_38341, __global
                                    unsigned char *res_mem_38358, __global
                                    unsigned char *mem_38361, __global
                                    unsigned char *mem_38421, __global
                                    unsigned char *mem_38425, __global
                                    unsigned char *mem_38428, __global
                                    unsigned char *mem_38431)
{
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    __local volatile char *restrict mem_38403_backing_0 =
                          mem_38403_backing_aligned_0;
    __local volatile char *restrict mem_38405_backing_1 =
                          mem_38405_backing_aligned_1;
    __local volatile char *restrict mem_38408_backing_2 =
                          mem_38408_backing_aligned_2;
    __local volatile char *restrict mem_38411_backing_3 =
                          mem_38411_backing_aligned_3;
    __local volatile char *restrict mem_38414_backing_4 =
                          mem_38414_backing_aligned_4;
    __local volatile char *restrict mem_38417_backing_5 =
                          mem_38417_backing_aligned_5;
    int32_t global_tid_35851;
    int32_t local_tid_35852;
    int32_t group_sizze_39160;
    int32_t wave_sizze_39159;
    int32_t group_id_35853;
    
    global_tid_35851 = get_global_id(0);
    local_tid_35852 = get_local_id(0);
    group_sizze_39160 = get_local_size(0);
    wave_sizze_39159 = LOCKSTEP_WIDTH;
    group_id_35853 = get_group_id(0);
    
    int32_t gtid_35839;
    int32_t ltid_35840;
    
    gtid_35839 = squot32(global_tid_35851, arg_31616);
    ltid_35840 = global_tid_35851 - squot32(global_tid_35851, arg_31616) *
        arg_31616;
    
    int32_t x_36172;
    float x_36173;
    int32_t x_36174;
    float x_36175;
    int32_t copy_p_36178;
    int32_t y_36179;
    float res_36180;
    float res_36181;
    float y_36182;
    bool cond_36192;
    float x_37170;
    
    if (slt32(gtid_35839, sizze_31215) && slt32(ltid_35840, arg_31616)) {
        x_36172 = *(__global int32_t *) &res_mem_38340[gtid_35839 * 4];
        x_36173 = *(__global float *) &res_mem_38341[gtid_35839 * 4];
        x_36174 = *(__global int32_t *) &res_mem_38339[gtid_35839 * 4];
        x_36175 = *(__global float *) &res_mem_38358[gtid_35839 * 4];
        copy_p_36178 = *(__global int32_t *) &res_mem_38289[gtid_35839 * 4];
        y_36179 = copy_p_36178 - x_36172;
        res_36180 = sitofp_i32_f32(x_36172);
        res_36181 = futrts_sqrt32(res_36180);
        y_36182 = x_36173 * res_36181;
        cond_36192 = sle32(y_36179, ltid_35840);
        if (cond_36192) {
            x_37170 = 0.0F;
        } else {
            bool cond_36194;
            float res_36195;
            
            cond_36194 = ltid_35840 == 0;
            if (cond_36194) {
                res_36195 = x_36175;
            } else {
                int32_t x_36196;
                int32_t i_36197;
                float negate_arg_36198;
                float x_36199;
                int32_t i_36200;
                float y_36201;
                float res_36202;
                
                x_36196 = x_36172 - x_36174;
                i_36197 = ltid_35840 + x_36196;
                negate_arg_36198 = *(__global
                                     float *) &res_mem_38290[(gtid_35839 *
                                                              sizze_31214 +
                                                              i_36197) * 4];
                x_36199 = 0.0F - negate_arg_36198;
                i_36200 = ltid_35840 + x_36172;
                y_36201 = *(__global float *) &res_mem_38290[(gtid_35839 *
                                                              sizze_31214 +
                                                              i_36200) * 4];
                res_36202 = x_36199 + y_36201;
                res_36195 = res_36202;
            }
            x_37170 = res_36195;
        }
    }
    
    __local char *mem_38403;
    
    mem_38403 = (__local char *) mem_38403_backing_0;
    for (int32_t comb_iter_39161 = 0; comb_iter_39161 < 1; comb_iter_39161++) {
        int32_t ctid_35842;
        int32_t flat_comb_id_39162 = comb_iter_39161 * arg_31616 +
                local_tid_35852;
        
        ctid_35842 = flat_comb_id_39162;
        if (slt32(ctid_35842, arg_31616) && 1) {
            *(__local float *) &mem_38403[ctid_35842 * 4] = x_37170;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    float x_36204;
    float x_36205;
    float x_39163;
    float x_39164;
    int32_t skip_threads_39166;
    
    if (slt32(local_tid_35852, arg_31616)) {
        x_36205 = *(volatile __local float *) &mem_38403[local_tid_35852 *
                                                         sizeof(float)];
    }
    // in-block scan (hopefully no barriers needed)
    {
        skip_threads_39166 = 1;
        while (slt32(skip_threads_39166, 32)) {
            if (sle32(skip_threads_39166, local_tid_35852 -
                      squot32(local_tid_35852, 32) * 32) &&
                slt32(local_tid_35852, arg_31616)) {
                // read operands
                {
                    x_36204 = *(volatile __local
                                float *) &mem_38403[(local_tid_35852 -
                                                     skip_threads_39166) *
                                                    sizeof(float)];
                }
                // perform operation
                {
                    float res_36206;
                    
                    if (slt32(gtid_35839, sizze_31215) && slt32(ltid_35840,
                                                                arg_31616)) {
                        res_36206 = x_36204 + x_36205;
                    }
                    x_36205 = res_36206;
                }
            }
            if (sle32(wave_sizze_39159, skip_threads_39166)) {
                barrier(CLK_LOCAL_MEM_FENCE);
            }
            if (sle32(skip_threads_39166, local_tid_35852 -
                      squot32(local_tid_35852, 32) * 32) &&
                slt32(local_tid_35852, arg_31616)) {
                // write result
                {
                    *(volatile __local float *) &mem_38403[local_tid_35852 *
                                                           sizeof(float)] =
                        x_36205;
                }
            }
            if (sle32(wave_sizze_39159, skip_threads_39166)) {
                barrier(CLK_LOCAL_MEM_FENCE);
            }
            skip_threads_39166 *= 2;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // last thread of block 'i' writes its result to offset 'i'
    {
        if ((local_tid_35852 - squot32(local_tid_35852, 32) * 32) == 31 &&
            slt32(local_tid_35852, arg_31616)) {
            *(volatile __local float *) &mem_38403[squot32(local_tid_35852,
                                                           32) *
                                                   sizeof(float)] = x_36205;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
    {
        int32_t skip_threads_39167;
        
        if (squot32(local_tid_35852, 32) == 0 && slt32(local_tid_35852,
                                                       arg_31616)) {
            x_39164 = *(volatile __local float *) &mem_38403[local_tid_35852 *
                                                             sizeof(float)];
        }
        // in-block scan (hopefully no barriers needed)
        {
            skip_threads_39167 = 1;
            while (slt32(skip_threads_39167, 32)) {
                if (sle32(skip_threads_39167, local_tid_35852 -
                          squot32(local_tid_35852, 32) * 32) &&
                    (squot32(local_tid_35852, 32) == 0 && slt32(local_tid_35852,
                                                                arg_31616))) {
                    // read operands
                    {
                        x_39163 = *(volatile __local
                                    float *) &mem_38403[(local_tid_35852 -
                                                         skip_threads_39167) *
                                                        sizeof(float)];
                    }
                    // perform operation
                    {
                        float res_39165;
                        
                        if (slt32(gtid_35839, sizze_31215) && slt32(ltid_35840,
                                                                    arg_31616)) {
                            res_39165 = x_39163 + x_39164;
                        }
                        x_39164 = res_39165;
                    }
                }
                if (sle32(wave_sizze_39159, skip_threads_39167)) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
                if (sle32(skip_threads_39167, local_tid_35852 -
                          squot32(local_tid_35852, 32) * 32) &&
                    (squot32(local_tid_35852, 32) == 0 && slt32(local_tid_35852,
                                                                arg_31616))) {
                    // write result
                    {
                        *(volatile __local float *) &mem_38403[local_tid_35852 *
                                                               sizeof(float)] =
                            x_39164;
                    }
                }
                if (sle32(wave_sizze_39159, skip_threads_39167)) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
                skip_threads_39167 *= 2;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // carry-in for every block except the first
    {
        if (!(squot32(local_tid_35852, 32) == 0 || !slt32(local_tid_35852,
                                                          arg_31616))) {
            // read operands
            {
                x_36204 = *(volatile __local
                            float *) &mem_38403[(squot32(local_tid_35852, 32) -
                                                 1) * sizeof(float)];
            }
            // perform operation
            {
                float res_36206;
                
                if (slt32(gtid_35839, sizze_31215) && slt32(ltid_35840,
                                                            arg_31616)) {
                    res_36206 = x_36204 + x_36205;
                }
                x_36205 = res_36206;
            }
            // write final result
            {
                *(volatile __local float *) &mem_38403[local_tid_35852 *
                                                       sizeof(float)] = x_36205;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // restore correct values for first block
    {
        if (squot32(local_tid_35852, 32) == 0) {
            *(volatile __local float *) &mem_38403[local_tid_35852 *
                                                   sizeof(float)] = x_36205;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t x_36218;
    bool cond_36221;
    float x_37172;
    float x_37174;
    float res_36220;
    bool res_36222;
    bool res_36223;
    bool x_36224;
    float res_36225;
    bool res_36226;
    bool x_36227;
    float res_36228;
    
    if (slt32(gtid_35839, sizze_31215) && slt32(ltid_35840, arg_31616)) {
        x_36218 = ltid_35840;
        cond_36221 = slt32(ltid_35840, y_36179);
        x_37172 = *(__local float *) &mem_38403[ltid_35840 * 4];
        x_37174 = *(__global float *) &mem_38361[ltid_35840 * 4];
        res_36220 = x_37172 / y_36182;
        res_36222 = futrts_isnan32(res_36220);
        res_36223 = !res_36222;
        x_36224 = cond_36221 && res_36223;
        res_36225 = (float) fabs(res_36220);
        res_36226 = x_37174 < res_36225;
        x_36227 = x_36224 && res_36226;
        if (cond_36221) {
            res_36228 = res_36220;
        } else {
            res_36228 = 0.0F;
        }
    }
    
    __local char *mem_38405;
    __local char *mem_38408;
    __local char *mem_38411;
    __local char *mem_38414;
    bool acc0_36229;
    int32_t acc0_36230;
    float acc0_36231;
    
    mem_38405 = (__local char *) mem_38405_backing_1;
    mem_38408 = (__local char *) mem_38408_backing_2;
    mem_38411 = (__local char *) mem_38411_backing_3;
    mem_38414 = (__local char *) mem_38414_backing_4;
    for (int32_t comb_iter_39168 = 0; comb_iter_39168 < 1; comb_iter_39168++) {
        int32_t ctid_35846;
        int32_t flat_comb_id_39169 = comb_iter_39168 * arg_31616 +
                local_tid_35852;
        
        ctid_35846 = flat_comb_id_39169;
        if (slt32(ctid_35846, arg_31616) && 1) {
            *(__local bool *) &mem_38405[ctid_35846] = x_36227;
            *(__local int32_t *) &mem_38408[ctid_35846 * 4] = x_36218;
            *(__local float *) &mem_38411[ctid_35846 * 4] = res_36228;
            *(__local float *) &mem_38414[ctid_35846 * 4] = res_36220;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t offset_39170;
    int32_t skip_waves_39171;
    bool x_36232;
    int32_t x_36233;
    float x_36234;
    bool x_36235;
    int32_t x_36236;
    float x_36237;
    
    offset_39170 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_35852, arg_31616)) {
            x_36232 = *(__local bool *) &mem_38405[local_tid_35852 +
                                                   offset_39170];
            x_36233 = *(__local int32_t *) &mem_38408[(local_tid_35852 +
                                                       offset_39170) * 4];
            x_36234 = *(__local float *) &mem_38411[(local_tid_35852 +
                                                     offset_39170) * 4];
        }
    }
    offset_39170 = 1;
    while (slt32(offset_39170, wave_sizze_39159)) {
        if (slt32(local_tid_35852 + offset_39170, arg_31616) &&
            ((local_tid_35852 - squot32(local_tid_35852, wave_sizze_39159) *
              wave_sizze_39159) & (2 * offset_39170 - 1)) == 0) {
            // read array element
            {
                x_36235 = *(volatile __local
                            bool *) &mem_38405[local_tid_35852 + offset_39170];
                x_36236 = *(volatile __local
                            int32_t *) &mem_38408[(local_tid_35852 +
                                                   offset_39170) * 4];
                x_36237 = *(volatile __local
                            float *) &mem_38411[(local_tid_35852 +
                                                 offset_39170) * 4];
            }
            // apply reduction operation
            {
                bool res_36238;
                int32_t res_36239;
                float res_36244;
                
                if (slt32(gtid_35839, sizze_31215) && slt32(ltid_35840,
                                                            arg_31616)) {
                    if (x_36232) {
                        res_36238 = x_36232;
                        res_36239 = x_36233;
                    } else {
                        bool x_36240;
                        bool y_36241;
                        bool res_36242;
                        int32_t res_36243;
                        
                        x_36240 = !x_36235;
                        y_36241 = x_36232 && x_36240;
                        res_36242 = x_36235 || y_36241;
                        if (x_36235) {
                            res_36243 = x_36236;
                        } else {
                            res_36243 = x_36233;
                        }
                        res_36238 = res_36242;
                        res_36239 = res_36243;
                    }
                    res_36244 = x_36234 + x_36237;
                }
                x_36232 = res_36238;
                x_36233 = res_36239;
                x_36234 = res_36244;
            }
            // write result of operation
            {
                *(volatile __local bool *) &mem_38405[local_tid_35852] =
                    x_36232;
                *(volatile __local int32_t *) &mem_38408[local_tid_35852 * 4] =
                    x_36233;
                *(volatile __local float *) &mem_38411[local_tid_35852 * 4] =
                    x_36234;
            }
        }
        offset_39170 *= 2;
    }
    skip_waves_39171 = 1;
    while (slt32(skip_waves_39171, squot32(arg_31616 + wave_sizze_39159 - 1,
                                           wave_sizze_39159))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_39170 = skip_waves_39171 * wave_sizze_39159;
        if (slt32(local_tid_35852 + offset_39170, arg_31616) &&
            ((local_tid_35852 - squot32(local_tid_35852, wave_sizze_39159) *
              wave_sizze_39159) == 0 && (squot32(local_tid_35852,
                                                 wave_sizze_39159) & (2 *
                                                                      skip_waves_39171 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_36235 = *(__local bool *) &mem_38405[local_tid_35852 +
                                                       offset_39170];
                x_36236 = *(__local int32_t *) &mem_38408[(local_tid_35852 +
                                                           offset_39170) * 4];
                x_36237 = *(__local float *) &mem_38411[(local_tid_35852 +
                                                         offset_39170) * 4];
            }
            // apply reduction operation
            {
                bool res_36238;
                int32_t res_36239;
                float res_36244;
                
                if (slt32(gtid_35839, sizze_31215) && slt32(ltid_35840,
                                                            arg_31616)) {
                    if (x_36232) {
                        res_36238 = x_36232;
                        res_36239 = x_36233;
                    } else {
                        bool x_36240;
                        bool y_36241;
                        bool res_36242;
                        int32_t res_36243;
                        
                        x_36240 = !x_36235;
                        y_36241 = x_36232 && x_36240;
                        res_36242 = x_36235 || y_36241;
                        if (x_36235) {
                            res_36243 = x_36236;
                        } else {
                            res_36243 = x_36233;
                        }
                        res_36238 = res_36242;
                        res_36239 = res_36243;
                    }
                    res_36244 = x_36234 + x_36237;
                }
                x_36232 = res_36238;
                x_36233 = res_36239;
                x_36234 = res_36244;
            }
            // write result of operation
            {
                *(__local bool *) &mem_38405[local_tid_35852] = x_36232;
                *(__local int32_t *) &mem_38408[local_tid_35852 * 4] = x_36233;
                *(__local float *) &mem_38411[local_tid_35852 * 4] = x_36234;
            }
        }
        skip_waves_39171 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    acc0_36229 = *(__local bool *) &mem_38405[0];
    acc0_36230 = *(__local int32_t *) &mem_38408[0];
    acc0_36231 = *(__local float *) &mem_38411[0];
    
    int32_t res_36250;
    bool cond_36257;
    int32_t res_36258;
    bool cond_36264;
    bool res_36265;
    bool x_36266;
    bool y_36267;
    bool cond_36268;
    int32_t res_36269;
    
    if (slt32(gtid_35839, sizze_31215) && slt32(ltid_35840, arg_31616)) {
        if (acc0_36229) {
            res_36250 = acc0_36230;
        } else {
            res_36250 = -1;
        }
        cond_36257 = !acc0_36229;
        if (cond_36257) {
            res_36258 = -1;
        } else {
            bool cond_36259;
            int32_t res_36260;
            
            cond_36259 = slt32(res_36250, y_36179);
            if (cond_36259) {
                int32_t i_36261;
                int32_t x_36262;
                int32_t res_36263;
                
                i_36261 = x_36172 + res_36250;
                x_36262 = *(__global int32_t *) &res_mem_38291[(gtid_35839 *
                                                                sizze_31214 +
                                                                i_36261) * 4];
                res_36263 = x_36262 - n_31219;
                res_36260 = res_36263;
            } else {
                res_36260 = -1;
            }
            res_36258 = res_36260;
        }
        cond_36264 = sle32(x_36172, 5);
        res_36265 = sle32(y_36179, 5);
        x_36266 = !cond_36264;
        y_36267 = res_36265 && x_36266;
        cond_36268 = cond_36264 || y_36267;
        if (cond_36268) {
            res_36269 = -2;
        } else {
            res_36269 = res_36258;
        }
    }
    
    __local char *mem_38417;
    
    mem_38417 = (__local char *) mem_38417_backing_5;
    for (int32_t comb_iter_39172 = 0; comb_iter_39172 < 1; comb_iter_39172++) {
        int32_t new_local_index_35847;
        int32_t flat_comb_id_39173 = comb_iter_39172 * arg_31616 +
                local_tid_35852;
        
        new_local_index_35847 = flat_comb_id_39173;
        if (slt32(new_local_index_35847, arg_31616) && 1) {
            *(__local float *) &mem_38417[new_local_index_35847 * 4] = NAN;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    float x_37120;
    int32_t x_37122;
    
    if (slt32(gtid_35839, sizze_31215) && slt32(ltid_35840, arg_31616)) {
        x_37120 = *(__local float *) &mem_38414[ltid_35840 * 4];
        if (cond_36221) {
            int32_t i_36277;
            int32_t x_36278;
            int32_t res_36279;
            
            i_36277 = ltid_35840 + x_36172;
            x_36278 = *(__global int32_t *) &res_mem_38291[(gtid_35839 *
                                                            sizze_31214 +
                                                            i_36277) * 4];
            res_36279 = x_36278 - n_31219;
            x_37122 = res_36279;
        } else {
            x_37122 = -1;
        }
    }
    for (int32_t comb_iter_39174 = 0; comb_iter_39174 < 1; comb_iter_39174++) {
        int32_t ctid_35849;
        int32_t flat_comb_id_39175 = comb_iter_39174 * arg_31616 +
                local_tid_35852;
        
        ctid_35849 = flat_comb_id_39175;
        if (slt32(ctid_35849, arg_31616) && 1) {
            if (sle32(0, x_37122) && slt32(x_37122, arg_31616)) {
                *(__local float *) &mem_38417[x_37122 * 4] = x_37120;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int32_t i_39176 = 0; i_39176 < squot32(arg_31616 - local_tid_35852 +
                                                arg_31616 - 1, arg_31616);
         i_39176++) {
        *(__global float *) &mem_38421[(group_id_35853 * arg_31616 + (i_39176 *
                                                                      arg_31616 +
                                                                      local_tid_35852)) *
                                       4] = *(__local
                                              float *) &mem_38417[(i_39176 *
                                                                   arg_31616 +
                                                                   local_tid_35852) *
                                                                  4];
    }
    for (int32_t i_39177 = 0; i_39177 < squot32(arg_31616 - local_tid_35852 +
                                                arg_31616 - 1, arg_31616);
         i_39177++) {
        *(__global float *) &mem_38425[(group_id_35853 * arg_31616 + (i_39177 *
                                                                      arg_31616 +
                                                                      local_tid_35852)) *
                                       4] = *(__local
                                              float *) &mem_38414[(i_39177 *
                                                                   arg_31616 +
                                                                   local_tid_35852) *
                                                                  4];
    }
    if (local_tid_35852 == 0) {
        *(__global int32_t *) &mem_38428[group_id_35853 * 4] = res_36269;
    }
    if (local_tid_35852 == 0) {
        *(__global float *) &mem_38431[group_id_35853 * 4] = acc0_36231;
    }
}
// Transpose kernel for float elements, staged through a local-memory tile.
//
// The work-group index along dimension 2 selects one x_elems_5 * y_elems_6
// slab of the source and destination buffers.  Within a slab, work-items
// first copy source elements into the tile, synchronise, and then write the
// tile back out with the two coordinates swapped.
//
// destoffset_1 / srcoffset_3: offsets into destmem_0 / srcmem_2; they are
//   divided by 4 before being used as element indices (presumably byte
//   offsets - confirm against the host code).
// x_elems_5 / y_elems_6: row length of the source / destination layout.
// in_elems_7 / out_elems_8: upper bounds applied to the per-slab source /
//   destination element index.
// num_arrays_4, mulx_9, muly_10: not referenced by this kernel body.
//
// NOTE(review): the index arithmetic (32-wide tile, four passes of 8 rows)
// looks like it expects a 32x8 work-group - confirm against the launch code.
__kernel void map_transpose_f32(int32_t destoffset_1, int32_t srcoffset_3,
                                int32_t num_arrays_4, int32_t x_elems_5,
                                int32_t y_elems_6, int32_t in_elems_7,
                                int32_t out_elems_8, int32_t mulx_9,
                                int32_t muly_10, __global
                                unsigned char *destmem_0, __global
                                unsigned char *srcmem_2)
{
    // 4224 bytes of scratch; rows are addressed with a stride of 33 floats
    // (32 * 33 * 4 = 4224).
    ALIGNED_LOCAL_MEMORY(block_11_backing_0, 4224);

    __local float *tile = (__local float *) block_11_backing_0;
    __global float *dest = (__global float *) destmem_0;
    __global const float *src = (__global const float *) srcmem_2;

    const int32_t gid_x = get_global_id(0);
    const int32_t lid_x = get_local_id(0);
    const int32_t lid_y = get_local_id(1);
    const int32_t grp_x = get_group_id(0);
    const int32_t grp_y = get_group_id(1);
    const int32_t grp_z = get_group_id(2);

    // Element index at which this slab starts in both buffers.
    const int32_t slab_base = grp_z * x_elems_5 * y_elems_6;
    const int32_t dest_base = squot32(destoffset_1, 4) + slab_base;
    const int32_t src_base = squot32(srcoffset_3, 4) + slab_base;

    // Phase 1: global -> local.  Each work-item loads up to four elements,
    // spaced 8 rows apart.
    const int32_t load_col = gid_x;
    const int32_t load_row = grp_y * 32 + lid_y;

    if (slt32(load_col, x_elems_5)) {
        for (int32_t pass = 0; pass < 4; pass++) {
            const int32_t dy = pass * 8;
            const int32_t src_idx = (load_row + dy) * x_elems_5 + load_col;

            if (slt32(load_row + dy, y_elems_6) &&
                slt32(src_idx, in_elems_7)) {
                tile[(lid_y + dy) * 33 + lid_x] = src[src_base + src_idx];
            }
        }
    }

    // Every work-item must reach this barrier, so it stays outside the
    // bounds checks above and below.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Phase 2: local -> global, reading the tile with swapped coordinates.
    const int32_t store_col = grp_y * 32 + lid_x;
    const int32_t store_row = grp_x * 32 + lid_y;

    if (slt32(store_col, y_elems_6)) {
        for (int32_t pass = 0; pass < 4; pass++) {
            const int32_t dy = pass * 8;
            const int32_t dest_idx = (store_row + dy) * y_elems_6 + store_col;

            if (slt32(store_row + dy, x_elems_5) &&
                slt32(dest_idx, out_elems_8)) {
                dest[dest_base + dest_idx] = tile[lid_x * 33 + lid_y + dy];
            }
        }
    }
}
// Transpose kernel for float elements; variant whose work-item to element
// mapping is stretched along x by the factor mulx_9.
//
// The work-group index along dimension 2 selects one x_elems_5 * y_elems_6
// slab.  Each work-item moves at most one element: it is staged in a
// local-memory tile, and after a barrier the tile is read back with the
// local coordinates swapped.
//
// destoffset_1 / srcoffset_3: offsets into destmem_0 / srcmem_2; divided by
//   4 before use as element indices (presumably byte offsets - confirm
//   against the host code).
// x_elems_5 / y_elems_6: row length of the source / destination layout.
// in_elems_7 / out_elems_8: upper bounds on the per-slab source /
//   destination element index.
// mulx_9: divisor/modulus used to fold the local y id into extra x range.
// num_arrays_4, muly_10: not referenced by this kernel body.
//
// NOTE(review): the 17-float tile stride and 1088-byte tile look like they
// expect a 16x16 work-group - confirm against the launch code.
__kernel void map_transpose_f32_low_height(int32_t destoffset_1,
                                           int32_t srcoffset_3,
                                           int32_t num_arrays_4,
                                           int32_t x_elems_5, int32_t y_elems_6,
                                           int32_t in_elems_7,
                                           int32_t out_elems_8, int32_t mulx_9,
                                           int32_t muly_10, __global
                                           unsigned char *destmem_0, __global
                                           unsigned char *srcmem_2)
{
    // 1088 bytes of scratch = 16 * 17 floats.
    ALIGNED_LOCAL_MEMORY(block_11_backing_0, 1088);

    __local float *tile = (__local float *) block_11_backing_0;
    __global float *dest = (__global float *) destmem_0;
    __global const float *src = (__global const float *) srcmem_2;

    const int32_t lid_x = get_local_id(0);
    const int32_t lid_y = get_local_id(1);
    const int32_t grp_x = get_group_id(0);
    const int32_t grp_y = get_group_id(1);
    const int32_t grp_z = get_group_id(2);

    // Element index at which this slab starts in both buffers.
    const int32_t slab_base = grp_z * x_elems_5 * y_elems_6;
    const int32_t dest_base = squot32(destoffset_1, 4) + slab_base;
    const int32_t src_base = squot32(srcoffset_3, 4) + slab_base;

    // Phase 1: global -> local.
    const int32_t load_col = grp_x * 16 * mulx_9 + lid_x +
                             srem32(lid_y, mulx_9) * 16;
    const int32_t load_row = grp_y * 16 + squot32(lid_y, mulx_9);
    const int32_t src_idx = load_row * x_elems_5 + load_col;

    if (slt32(load_col, x_elems_5) && slt32(load_row, y_elems_6) &&
        slt32(src_idx, in_elems_7)) {
        tile[lid_y * 17 + lid_x] = src[src_base + src_idx];
    }

    // Reached by every work-item; kept outside the bounds checks.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Phase 2: local -> global, with the roles of the local ids exchanged.
    const int32_t store_col = grp_y * 16 + squot32(lid_x, mulx_9);
    const int32_t store_row = grp_x * 16 * mulx_9 + lid_y +
                              srem32(lid_x, mulx_9) * 16;
    const int32_t dest_idx = store_row * y_elems_6 + store_col;

    if (slt32(store_col, y_elems_6) && slt32(store_row, x_elems_5) &&
        slt32(dest_idx, out_elems_8)) {
        dest[dest_base + dest_idx] = tile[lid_x * 17 + lid_y];
    }
}
// Transpose kernel for float elements; variant whose work-item to element
// mapping is stretched along y by the factor muly_10.
//
// The work-group index along dimension 2 selects one x_elems_5 * y_elems_6
// slab.  Each work-item moves at most one element: it is staged in a
// local-memory tile, and after a barrier the tile is read back with the
// local coordinates swapped.
//
// destoffset_1 / srcoffset_3: offsets into destmem_0 / srcmem_2; divided by
//   4 before use as element indices (presumably byte offsets - confirm
//   against the host code).
// x_elems_5 / y_elems_6: row length of the source / destination layout.
// in_elems_7 / out_elems_8: upper bounds on the per-slab source /
//   destination element index.
// muly_10: divisor/modulus used to fold the local x id into extra y range.
// num_arrays_4, mulx_9: not referenced by this kernel body.
//
// NOTE(review): the 17-float tile stride and 1088-byte tile look like they
// expect a 16x16 work-group - confirm against the launch code.
__kernel void map_transpose_f32_low_width(int32_t destoffset_1,
                                          int32_t srcoffset_3,
                                          int32_t num_arrays_4,
                                          int32_t x_elems_5, int32_t y_elems_6,
                                          int32_t in_elems_7,
                                          int32_t out_elems_8, int32_t mulx_9,
                                          int32_t muly_10, __global
                                          unsigned char *destmem_0, __global
                                          unsigned char *srcmem_2)
{
    // 1088 bytes of scratch = 16 * 17 floats.
    ALIGNED_LOCAL_MEMORY(block_11_backing_0, 1088);

    __local float *tile = (__local float *) block_11_backing_0;
    __global float *dest = (__global float *) destmem_0;
    __global const float *src = (__global const float *) srcmem_2;

    const int32_t lid_x = get_local_id(0);
    const int32_t lid_y = get_local_id(1);
    const int32_t grp_x = get_group_id(0);
    const int32_t grp_y = get_group_id(1);
    const int32_t grp_z = get_group_id(2);

    // Element index at which this slab starts in both buffers.
    const int32_t slab_base = grp_z * x_elems_5 * y_elems_6;
    const int32_t dest_base = squot32(destoffset_1, 4) + slab_base;
    const int32_t src_base = squot32(srcoffset_3, 4) + slab_base;

    // Phase 1: global -> local.
    const int32_t load_col = grp_x * 16 + squot32(lid_x, muly_10);
    const int32_t load_row = grp_y * 16 * muly_10 + lid_y +
                             srem32(lid_x, muly_10) * 16;
    const int32_t src_idx = load_row * x_elems_5 + load_col;

    if (slt32(load_col, x_elems_5) && slt32(load_row, y_elems_6) &&
        slt32(src_idx, in_elems_7)) {
        tile[lid_y * 17 + lid_x] = src[src_base + src_idx];
    }

    // Reached by every work-item; kept outside the bounds checks.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Phase 2: local -> global, with the roles of the local ids exchanged.
    const int32_t store_col = grp_y * 16 * muly_10 + lid_x +
                              srem32(lid_y, muly_10) * 16;
    const int32_t store_row = grp_x * 16 + squot32(lid_y, muly_10);
    const int32_t dest_idx = store_row * y_elems_6 + store_col;

    if (slt32(store_col, y_elems_6) && slt32(store_row, x_elems_5) &&
        slt32(dest_idx, out_elems_8)) {
        dest[dest_base + dest_idx] = tile[lid_x * 17 + lid_y];
    }
}
// Transpose kernel for float elements that copies one element per work-item
// directly from source to destination, without staging through local memory.
//
// The flat global id along dimension 0 is decomposed into a slab number
// (which x_elems_5 * y_elems_6 slab), and an (x, y) position inside that
// slab; the element is read at y * x_elems_5 + x and written at
// x * y_elems_6 + y.
//
// destoffset_1 / srcoffset_3: offsets into destmem_0 / srcmem_2; divided by
//   4 before use as element indices (presumably byte offsets - confirm
//   against the host code).
// in_elems_7: work-items whose global id is not below this value do nothing.
// num_arrays_4, out_elems_8, mulx_9, muly_10: not referenced by this kernel
//   body.
__kernel void map_transpose_f32_small(int32_t destoffset_1, int32_t srcoffset_3,
                                      int32_t num_arrays_4, int32_t x_elems_5,
                                      int32_t y_elems_6, int32_t in_elems_7,
                                      int32_t out_elems_8, int32_t mulx_9,
                                      int32_t muly_10, __global
                                      unsigned char *destmem_0, __global
                                      unsigned char *srcmem_2)
{
    // One byte of local memory is reserved but never accessed below; it is
    // retained so the kernel's resource footprint is unchanged.
    ALIGNED_LOCAL_MEMORY(block_11_backing_0, 1);

    __global float *dest = (__global float *) destmem_0;
    __global const float *src = (__global const float *) srcmem_2;

    const int32_t gid = get_global_id(0);
    const int32_t slab_elems = y_elems_6 * x_elems_5;

    // Start of the slab this work-item belongs to, as an element index.
    const int32_t slab_base = squot32(gid, slab_elems) * slab_elems;

    // Position of this work-item inside its slab.
    const int32_t x = squot32(srem32(gid, slab_elems), y_elems_6);
    const int32_t y = srem32(gid, y_elems_6);

    const int32_t dest_base = squot32(destoffset_1, 4) + slab_base;
    const int32_t src_base = squot32(srcoffset_3, 4) + slab_base;
    const int32_t src_idx = y * x_elems_5 + x;
    const int32_t dest_idx = x * y_elems_6 + y;

    if (slt32(gid, in_elems_7)) {
        dest[dest_base + dest_idx] = src[src_base + src_idx];
    }
}
// Transpose kernel for int32_t elements, staged through a local-memory tile.
//
// The work-group index along dimension 2 selects one x_elems_5 * y_elems_6
// slab of the source and destination buffers.  Within a slab, work-items
// first copy source elements into the tile, synchronise, and then write the
// tile back out with the two coordinates swapped.
//
// destoffset_1 / srcoffset_3: offsets into destmem_0 / srcmem_2; they are
//   divided by 4 before being used as element indices (presumably byte
//   offsets - confirm against the host code).
// x_elems_5 / y_elems_6: row length of the source / destination layout.
// in_elems_7 / out_elems_8: upper bounds applied to the per-slab source /
//   destination element index.
// num_arrays_4, mulx_9, muly_10: not referenced by this kernel body.
//
// NOTE(review): the index arithmetic (32-wide tile, four passes of 8 rows)
// looks like it expects a 32x8 work-group - confirm against the launch code.
__kernel void map_transpose_i32(int32_t destoffset_1, int32_t srcoffset_3,
                                int32_t num_arrays_4, int32_t x_elems_5,
                                int32_t y_elems_6, int32_t in_elems_7,
                                int32_t out_elems_8, int32_t mulx_9,
                                int32_t muly_10, __global
                                unsigned char *destmem_0, __global
                                unsigned char *srcmem_2)
{
    // 4224 bytes of scratch; rows are addressed with a stride of 33
    // elements (32 * 33 * 4 = 4224).
    ALIGNED_LOCAL_MEMORY(block_11_backing_0, 4224);

    __local int32_t *tile = (__local int32_t *) block_11_backing_0;
    __global int32_t *dest = (__global int32_t *) destmem_0;
    __global const int32_t *src = (__global const int32_t *) srcmem_2;

    const int32_t gid_x = get_global_id(0);
    const int32_t lid_x = get_local_id(0);
    const int32_t lid_y = get_local_id(1);
    const int32_t grp_x = get_group_id(0);
    const int32_t grp_y = get_group_id(1);
    const int32_t grp_z = get_group_id(2);

    // Element index at which this slab starts in both buffers.
    const int32_t slab_base = grp_z * x_elems_5 * y_elems_6;
    const int32_t dest_base = squot32(destoffset_1, 4) + slab_base;
    const int32_t src_base = squot32(srcoffset_3, 4) + slab_base;

    // Phase 1: global -> local.  Each work-item loads up to four elements,
    // spaced 8 rows apart.
    const int32_t load_col = gid_x;
    const int32_t load_row = grp_y * 32 + lid_y;

    if (slt32(load_col, x_elems_5)) {
        for (int32_t pass = 0; pass < 4; pass++) {
            const int32_t dy = pass * 8;
            const int32_t src_idx = (load_row + dy) * x_elems_5 + load_col;

            if (slt32(load_row + dy, y_elems_6) &&
                slt32(src_idx, in_elems_7)) {
                tile[(lid_y + dy) * 33 + lid_x] = src[src_base + src_idx];
            }
        }
    }

    // Every work-item must reach this barrier, so it stays outside the
    // bounds checks above and below.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Phase 2: local -> global, reading the tile with swapped coordinates.
    const int32_t store_col = grp_y * 32 + lid_x;
    const int32_t store_row = grp_x * 32 + lid_y;

    if (slt32(store_col, y_elems_6)) {
        for (int32_t pass = 0; pass < 4; pass++) {
            const int32_t dy = pass * 8;
            const int32_t dest_idx = (store_row + dy) * y_elems_6 + store_col;

            if (slt32(store_row + dy, x_elems_5) &&
                slt32(dest_idx, out_elems_8)) {
                dest[dest_base + dest_idx] = tile[lid_x * 33 + lid_y + dy];
            }
        }
    }
}
// Transpose kernel for int32_t elements; variant whose work-item to element
// mapping is stretched along x by the factor mulx_9.
//
// The work-group index along dimension 2 selects one x_elems_5 * y_elems_6
// slab.  Each work-item moves at most one element: it is staged in a
// local-memory tile, and after a barrier the tile is read back with the
// local coordinates swapped.
//
// destoffset_1 / srcoffset_3: offsets into destmem_0 / srcmem_2; divided by
//   4 before use as element indices (presumably byte offsets - confirm
//   against the host code).
// x_elems_5 / y_elems_6: row length of the source / destination layout.
// in_elems_7 / out_elems_8: upper bounds on the per-slab source /
//   destination element index.
// mulx_9: divisor/modulus used to fold the local y id into extra x range.
// num_arrays_4, muly_10: not referenced by this kernel body.
//
// NOTE(review): the 17-element tile stride and 1088-byte tile look like
// they expect a 16x16 work-group - confirm against the launch code.
__kernel void map_transpose_i32_low_height(int32_t destoffset_1,
                                           int32_t srcoffset_3,
                                           int32_t num_arrays_4,
                                           int32_t x_elems_5, int32_t y_elems_6,
                                           int32_t in_elems_7,
                                           int32_t out_elems_8, int32_t mulx_9,
                                           int32_t muly_10, __global
                                           unsigned char *destmem_0, __global
                                           unsigned char *srcmem_2)
{
    // 1088 bytes of scratch = 16 * 17 four-byte elements.
    ALIGNED_LOCAL_MEMORY(block_11_backing_0, 1088);

    __local int32_t *tile = (__local int32_t *) block_11_backing_0;
    __global int32_t *dest = (__global int32_t *) destmem_0;
    __global const int32_t *src = (__global const int32_t *) srcmem_2;

    const int32_t lid_x = get_local_id(0);
    const int32_t lid_y = get_local_id(1);
    const int32_t grp_x = get_group_id(0);
    const int32_t grp_y = get_group_id(1);
    const int32_t grp_z = get_group_id(2);

    // Element index at which this slab starts in both buffers.
    const int32_t slab_base = grp_z * x_elems_5 * y_elems_6;
    const int32_t dest_base = squot32(destoffset_1, 4) + slab_base;
    const int32_t src_base = squot32(srcoffset_3, 4) + slab_base;

    // Phase 1: global -> local.
    const int32_t load_col = grp_x * 16 * mulx_9 + lid_x +
                             srem32(lid_y, mulx_9) * 16;
    const int32_t load_row = grp_y * 16 + squot32(lid_y, mulx_9);
    const int32_t src_idx = load_row * x_elems_5 + load_col;

    if (slt32(load_col, x_elems_5) && slt32(load_row, y_elems_6) &&
        slt32(src_idx, in_elems_7)) {
        tile[lid_y * 17 + lid_x] = src[src_base + src_idx];
    }

    // Reached by every work-item; kept outside the bounds checks.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Phase 2: local -> global, with the roles of the local ids exchanged.
    const int32_t store_col = grp_y * 16 + squot32(lid_x, mulx_9);
    const int32_t store_row = grp_x * 16 * mulx_9 + lid_y +
                              srem32(lid_x, mulx_9) * 16;
    const int32_t dest_idx = store_row * y_elems_6 + store_col;

    if (slt32(store_col, y_elems_6) && slt32(store_row, x_elems_5) &&
        slt32(dest_idx, out_elems_8)) {
        dest[dest_base + dest_idx] = tile[lid_x * 17 + lid_y];
    }
}
// Transpose kernel for int32_t elements; variant whose work-item to element
// mapping is stretched along y by the factor muly_10.
//
// The work-group index along dimension 2 selects one x_elems_5 * y_elems_6
// slab.  Each work-item moves at most one element: it is staged in a
// local-memory tile, and after a barrier the tile is read back with the
// local coordinates swapped.
//
// destoffset_1 / srcoffset_3: offsets into destmem_0 / srcmem_2; divided by
//   4 before use as element indices (presumably byte offsets - confirm
//   against the host code).
// x_elems_5 / y_elems_6: row length of the source / destination layout.
// in_elems_7 / out_elems_8: upper bounds on the per-slab source /
//   destination element index.
// muly_10: divisor/modulus used to fold the local x id into extra y range.
// num_arrays_4, mulx_9: not referenced by this kernel body.
//
// NOTE(review): the 17-element tile stride and 1088-byte tile look like
// they expect a 16x16 work-group - confirm against the launch code.
__kernel void map_transpose_i32_low_width(int32_t destoffset_1,
                                          int32_t srcoffset_3,
                                          int32_t num_arrays_4,
                                          int32_t x_elems_5, int32_t y_elems_6,
                                          int32_t in_elems_7,
                                          int32_t out_elems_8, int32_t mulx_9,
                                          int32_t muly_10, __global
                                          unsigned char *destmem_0, __global
                                          unsigned char *srcmem_2)
{
    // 1088 bytes of scratch = 16 * 17 four-byte elements.
    ALIGNED_LOCAL_MEMORY(block_11_backing_0, 1088);

    __local int32_t *tile = (__local int32_t *) block_11_backing_0;
    __global int32_t *dest = (__global int32_t *) destmem_0;
    __global const int32_t *src = (__global const int32_t *) srcmem_2;

    const int32_t lid_x = get_local_id(0);
    const int32_t lid_y = get_local_id(1);
    const int32_t grp_x = get_group_id(0);
    const int32_t grp_y = get_group_id(1);
    const int32_t grp_z = get_group_id(2);

    // Element index at which this slab starts in both buffers.
    const int32_t slab_base = grp_z * x_elems_5 * y_elems_6;
    const int32_t dest_base = squot32(destoffset_1, 4) + slab_base;
    const int32_t src_base = squot32(srcoffset_3, 4) + slab_base;

    // Phase 1: global -> local.
    const int32_t load_col = grp_x * 16 + squot32(lid_x, muly_10);
    const int32_t load_row = grp_y * 16 * muly_10 + lid_y +
                             srem32(lid_x, muly_10) * 16;
    const int32_t src_idx = load_row * x_elems_5 + load_col;

    if (slt32(load_col, x_elems_5) && slt32(load_row, y_elems_6) &&
        slt32(src_idx, in_elems_7)) {
        tile[lid_y * 17 + lid_x] = src[src_base + src_idx];
    }

    // Reached by every work-item; kept outside the bounds checks.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Phase 2: local -> global, with the roles of the local ids exchanged.
    const int32_t store_col = grp_y * 16 * muly_10 + lid_x +
                              srem32(lid_y, muly_10) * 16;
    const int32_t store_row = grp_x * 16 + squot32(lid_y, muly_10);
    const int32_t dest_idx = store_row * y_elems_6 + store_col;

    if (slt32(store_col, y_elems_6) && slt32(store_row, x_elems_5) &&
        slt32(dest_idx, out_elems_8)) {
        dest[dest_base + dest_idx] = tile[lid_x * 17 + lid_y];
    }
}
// Transposes row-major matrices of 32-bit elements, one element per
// work-item, copying directly from srcmem_2 to destmem_0.
//
// The global id in dimension 0 is decomposed into a matrix number (blocks of
// x_elems_5 * y_elems_6 elements) and a position inside that matrix.  The
// source is indexed with a row length of x_elems_5 and the destination with a
// row length of y_elems_6.  destoffset_1 and srcoffset_3 are divided by 4
// before being used as element offsets.  Work-items whose global id is not
// below in_elems_7 do nothing.  num_arrays_4, out_elems_8, mulx_9 and muly_10
// are not read by this kernel.
//
// squot32, srem32 and slt32 are defined elsewhere in the program; presumably
// they are signed quotient, signed remainder and signed less-than.
__kernel void map_transpose_i32_small(int32_t destoffset_1, int32_t srcoffset_3,
                                      int32_t num_arrays_4, int32_t x_elems_5,
                                      int32_t y_elems_6, int32_t in_elems_7,
                                      int32_t out_elems_8, int32_t mulx_9,
                                      int32_t muly_10, __global
                                      unsigned char *destmem_0, __global
                                      unsigned char *srcmem_2)
{
    const int32_t gtid = get_global_id(0);

    // All index arithmetic lives inside the guard so that work-items past the
    // end of the input perform no divisions at all.
    if (slt32(gtid, in_elems_7)) {
        const int32_t matrix_elems = y_elems_6 * x_elems_5;

        // First element of the matrix this work-item belongs to.
        const int32_t array_offset = squot32(gtid, matrix_elems) *
            matrix_elems;
        const int32_t dest_base = squot32(destoffset_1, 4) + array_offset;
        const int32_t src_base = squot32(srcoffset_3, 4) + array_offset;

        // Position inside the matrix: the global id enumerates destination
        // elements in order, with y_index varying fastest.
        const int32_t x_index = squot32(srem32(gtid, matrix_elems), y_elems_6);
        const int32_t y_index = srem32(gtid, y_elems_6);
        const int32_t index_in = y_index * x_elems_5 + x_index;
        const int32_t index_out = x_index * y_elems_6 + y_index;

        *(__global int32_t *) &destmem_0[(dest_base + index_out) *
                                         sizeof(int32_t)] =
            *(__global int32_t *) &srcmem_2[(src_base + index_in) *
                                            sizeof(int32_t)];
    }
}
// Stores NAN into the first sizze_31215 * sizze_31214 float slots of
// mem_38281, one slot per work-item (byte offset = element index * 4).
// Work-items whose global id is not below that product do nothing.
//
// squot32 and slt32 are defined elsewhere in the program; presumably they are
// signed quotient and signed less-than.
__kernel void replicate_38973(int32_t sizze_31214, int32_t sizze_31215, __global
                              unsigned char *mem_38281)
{
    const int32_t gtid = get_global_id(0);

    if (slt32(gtid, sizze_31215 * sizze_31214)) {
        // Split the flat id into (row, column) with rows of sizze_31214
        // elements; the quotient is computed once and reused for both parts.
        const int32_t row = squot32(gtid, sizze_31214);
        const int32_t col = gtid - row * sizze_31214;

        *(__global float *) &mem_38281[(row * sizze_31214 + col) * 4] = NAN;
    }
}
// Stores 0 into the first sizze_31215 * sizze_31214 int32 slots of
// mem_38285, one slot per work-item (byte offset = element index * 4).
// Work-items whose global id is not below that product do nothing.
//
// squot32 and slt32 are defined elsewhere in the program; presumably they are
// signed quotient and signed less-than.
__kernel void replicate_38978(int32_t sizze_31214, int32_t sizze_31215, __global
                              unsigned char *mem_38285)
{
    const int32_t gtid = get_global_id(0);

    if (slt32(gtid, sizze_31215 * sizze_31214)) {
        // Split the flat id into (row, column) with rows of sizze_31214
        // elements; the quotient is computed once and reused for both parts.
        const int32_t row = squot32(gtid, sizze_31214);
        const int32_t col = gtid - row * sizze_31214;

        *(__global int32_t *) &mem_38285[(row * sizze_31214 + col) * 4] = 0;
    }
}
// Stores NAN into the first sizze_31215 * arg_31616 float slots of
// mem_38466, one slot per work-item (byte offset = element index * 4).
// Work-items whose global id is not below that product do nothing.
//
// squot32 and slt32 are defined elsewhere in the program; presumably they are
// signed quotient and signed less-than.
__kernel void replicate_39296(int32_t sizze_31215, int32_t arg_31616, __global
                              unsigned char *mem_38466)
{
    const int32_t gtid = get_global_id(0);

    if (slt32(gtid, sizze_31215 * arg_31616)) {
        // Split the flat id into (row, column) with rows of arg_31616
        // elements; the quotient is computed once and reused for both parts.
        const int32_t row = squot32(gtid, arg_31616);
        const int32_t col = gtid - row * arg_31616;

        *(__global float *) &mem_38466[(row * arg_31616 + col) * 4] = NAN;
    }
}
// Stage 1 of a scan over the flattened index space of sizze_31215 rows by
// sizze_31214 columns.
//
// Each work-group walks a contiguous range of the flat index space in chunks
// of group_sizze_35152 elements.  For every in-bounds element the kernel:
//   - reads x from images_mem_37894 (row stride sizze_31216) and y from
//     res_mem_38199 (row stride sizze_31214);
//   - stores x - y, or NAN when x is NaN, to mem_38274;
//   - feeds 1 into the scan when the stored value is not NaN, otherwise 0.
// The scan operator is int32 addition.  Two positions are only combined when
// the later one's column (flat index modulo sizze_31214) is at least their
// distance, so the running count restarts on every row.  The per-element
// scan results are written to mem_38270.
//
// mainzigroup_sizze_35127 and LOCKSTEP_WIDTH are not defined in this kernel;
// presumably they are compile-time macros supplied with the program.
// squot32, srem32, slt32, sle32 and futrts_isnan32 are likewise defined
// elsewhere; presumably signed quotient, remainder, <, <= and a NaN test.
// NOTE(review): each group only scans its own range here; presumably later
// stages combine the per-group results -- confirm against the host code.
__kernel void scan_stage1_35145(int32_t sizze_31214, int32_t sizze_31215,
                                int32_t sizze_31216, int32_t num_groups_35162,
                                __global unsigned char *images_mem_37894,
                                __global unsigned char *res_mem_38199, __global
                                unsigned char *mem_38270, __global
                                unsigned char *mem_38274)
{
    const int32_t group_sizze_35152 = mainzigroup_sizze_35127;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    // One int32 scan slot (4 bytes) per work-item of the group.
    ALIGNED_LOCAL_MEMORY(scan_arr_mem_38932_backing_0, 4 *
                         mainzigroup_sizze_35127);
    
    int32_t global_tid_35145;
    int32_t local_tid_35146;
    int32_t group_sizze_38928;
    int32_t wave_sizze_38927;
    int32_t group_id_35147;
    
    global_tid_35145 = get_global_id(0);
    local_tid_35146 = get_local_id(0);
    group_sizze_38928 = get_local_size(0);
    wave_sizze_38927 = LOCKSTEP_WIDTH;
    group_id_35147 = get_group_id(0);
    
    int32_t gtid_35122;
    int32_t gtid_35144;
    __local char *scan_arr_mem_38932;
    
    scan_arr_mem_38932 = (__local char *) scan_arr_mem_38932_backing_0;
    
    int32_t x_35169;
    int32_t x_35170;
    
    // x_35169 is the carry from the previous chunk.  It starts at 0 in every
    // work-item and is only ever updated by work-item 0 (end of the loop
    // body), so only work-item 0 adds a non-zero carry.
    x_35169 = 0;
    // Iteration count: (rows * cols + G - 1) / G with G = group size *
    // number of groups, i.e. the number of chunks each group processes.
    for (int32_t j_38934 = 0; j_38934 < squot32(sizze_31215 * sizze_31214 +
                                                group_sizze_35152 *
                                                num_groups_35162 - 1,
                                                group_sizze_35152 *
                                                num_groups_35162); j_38934++) {
        // Flat index of the first element of this chunk: the group's range
        // starts at group id * (group size * chunks per group).
        int32_t chunk_offset_38935 = group_sizze_35152 * j_38934 +
                group_id_35147 * (group_sizze_35152 * squot32(sizze_31215 *
                                                              sizze_31214 +
                                                              group_sizze_35152 *
                                                              num_groups_35162 -
                                                              1,
                                                              group_sizze_35152 *
                                                              num_groups_35162));
        int32_t flat_idx_38936 = chunk_offset_38935 + local_tid_35146;
        
        // gtid_35122 is the row and gtid_35144 the column of this element.
        gtid_35122 = squot32(flat_idx_38936, sizze_31214);
        gtid_35144 = flat_idx_38936 - squot32(flat_idx_38936, sizze_31214) *
            sizze_31214;
        // threads in bounds read input; others get neutral element
        {
            if (slt32(gtid_35122, sizze_31215) && slt32(gtid_35144,
                                                        sizze_31214)) {
                float x_35174;
                float x_35175;
                bool res_35176;
                bool cond_35177;
                float res_35178;
                bool res_35180;
                bool res_35181;
                int32_t res_35182;
                
                x_35174 = *(__global float *) &images_mem_37894[(gtid_35122 *
                                                                 sizze_31216 +
                                                                 gtid_35144) *
                                                                4];
                x_35175 = *(__global float *) &res_mem_38199[(gtid_35122 *
                                                              sizze_31214 +
                                                              gtid_35144) * 4];
                res_35176 = futrts_isnan32(x_35174);
                cond_35177 = !res_35176;
                // Difference of the two inputs, or NAN when the first input
                // is NaN.
                if (cond_35177) {
                    float res_35179 = x_35174 - x_35175;
                    
                    res_35178 = res_35179;
                } else {
                    res_35178 = NAN;
                }
                // Scan input: 1 when the mapped value is not NaN, else 0.
                res_35180 = futrts_isnan32(res_35178);
                res_35181 = !res_35180;
                if (res_35181) {
                    res_35182 = 1;
                } else {
                    res_35182 = 0;
                }
                // write to-scan values to parameters
                {
                    x_35170 = res_35182;
                }
                // write mapped values results to global memory
                {
                    *(__global float *) &mem_38274[(gtid_35122 * sizze_31214 +
                                                    gtid_35144) * 4] =
                        res_35178;
                }
            } else {
                x_35170 = 0;
            }
        }
        // combine with carry and write to local memory
        {
            int32_t res_35171 = x_35169 + x_35170;
            
            *(__local int32_t *) &scan_arr_mem_38932[local_tid_35146 * 4] =
                res_35171;
        }
        
        int32_t x_38929;
        int32_t x_38930;
        int32_t x_38937;
        int32_t x_38938;
        int32_t skip_threads_38940;
        
        if (slt32(local_tid_35146, group_sizze_35152)) {
            x_38930 = *(volatile __local
                        int32_t *) &scan_arr_mem_38932[local_tid_35146 *
                                                       sizeof(int32_t)];
        }
        // in-block scan (hopefully no barriers needed)
        // Scans within blocks of 32 consecutive work-items, doubling the
        // operand distance each round.  The barriers inside the loop only
        // execute once the distance reaches wave_sizze_38927.
        {
            skip_threads_38940 = 1;
            while (slt32(skip_threads_38940, 32)) {
                if (sle32(skip_threads_38940, local_tid_35146 -
                          squot32(local_tid_35146, 32) * 32) &&
                    slt32(local_tid_35146, group_sizze_35152)) {
                    // read operands
                    {
                        x_38929 = *(volatile __local
                                    int32_t *) &scan_arr_mem_38932[(local_tid_35146 -
                                                                    skip_threads_38940) *
                                                                   sizeof(int32_t)];
                    }
                    // perform operation
                    // The second slt32 argument reduces to
                    // skip_threads_38940, so the addition happens only when
                    // this element's column is >= the operand distance.
                    {
                        if (!slt32(srem32(local_tid_35146 + chunk_offset_38935,
                                          sizze_31214), local_tid_35146 +
                                   chunk_offset_38935 - (local_tid_35146 -
                                                         skip_threads_38940 +
                                                         chunk_offset_38935))) {
                            int32_t res_38931 = x_38929 + x_38930;
                            
                            x_38930 = res_38931;
                        }
                    }
                }
                if (sle32(wave_sizze_38927, skip_threads_38940)) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
                if (sle32(skip_threads_38940, local_tid_35146 -
                          squot32(local_tid_35146, 32) * 32) &&
                    slt32(local_tid_35146, group_sizze_35152)) {
                    // write result
                    {
                        *(volatile __local
                          int32_t *) &scan_arr_mem_38932[local_tid_35146 *
                                                         sizeof(int32_t)] =
                            x_38930;
                    }
                }
                if (sle32(wave_sizze_38927, skip_threads_38940)) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
                skip_threads_38940 *= 2;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // last thread of block 'i' writes its result to offset 'i'
        {
            if ((local_tid_35146 - squot32(local_tid_35146, 32) * 32) == 31 &&
                slt32(local_tid_35146, group_sizze_35152)) {
                *(volatile __local
                  int32_t *) &scan_arr_mem_38932[squot32(local_tid_35146, 32) *
                                                 sizeof(int32_t)] = x_38930;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
        // Only work-items 0..31 take part.  Work-item i stands for the last
        // element of block i (local index i * 32 + 31), so the distance used
        // in the row check below is skip_threads_38941 * 32.
        {
            int32_t skip_threads_38941;
            
            if (squot32(local_tid_35146, 32) == 0 && slt32(local_tid_35146,
                                                           group_sizze_35152)) {
                x_38938 = *(volatile __local
                            int32_t *) &scan_arr_mem_38932[local_tid_35146 *
                                                           sizeof(int32_t)];
            }
            // in-block scan (hopefully no barriers needed)
            {
                skip_threads_38941 = 1;
                while (slt32(skip_threads_38941, 32)) {
                    if (sle32(skip_threads_38941, local_tid_35146 -
                              squot32(local_tid_35146, 32) * 32) &&
                        (squot32(local_tid_35146, 32) == 0 &&
                         slt32(local_tid_35146, group_sizze_35152))) {
                        // read operands
                        {
                            x_38937 = *(volatile __local
                                        int32_t *) &scan_arr_mem_38932[(local_tid_35146 -
                                                                        skip_threads_38941) *
                                                                       sizeof(int32_t)];
                        }
                        // perform operation
                        {
                            if (!slt32(srem32(local_tid_35146 * 32 + 32 - 1 +
                                              chunk_offset_38935, sizze_31214),
                                       local_tid_35146 * 32 + 32 - 1 +
                                       chunk_offset_38935 - ((local_tid_35146 -
                                                              skip_threads_38941) *
                                                             32 + 32 - 1 +
                                                             chunk_offset_38935))) {
                                int32_t res_38939 = x_38937 + x_38938;
                                
                                x_38938 = res_38939;
                            }
                        }
                    }
                    if (sle32(wave_sizze_38927, skip_threads_38941)) {
                        barrier(CLK_LOCAL_MEM_FENCE);
                    }
                    if (sle32(skip_threads_38941, local_tid_35146 -
                              squot32(local_tid_35146, 32) * 32) &&
                        (squot32(local_tid_35146, 32) == 0 &&
                         slt32(local_tid_35146, group_sizze_35152))) {
                        // write result
                        {
                            *(volatile __local
                              int32_t *) &scan_arr_mem_38932[local_tid_35146 *
                                                             sizeof(int32_t)] =
                                x_38938;
                        }
                    }
                    if (sle32(wave_sizze_38927, skip_threads_38941)) {
                        barrier(CLK_LOCAL_MEM_FENCE);
                    }
                    skip_threads_38941 *= 2;
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // carry-in for every block except the first
        {
            if (!(squot32(local_tid_35146, 32) == 0 || !slt32(local_tid_35146,
                                                              group_sizze_35152))) {
                // read operands
                {
                    x_38929 = *(volatile __local
                                int32_t *) &scan_arr_mem_38932[(squot32(local_tid_35146,
                                                                        32) -
                                                                1) *
                                                               sizeof(int32_t)];
                }
                // perform operation
                // The distance here is to the last element of the previous
                // block: local id minus (block number * 32 - 1).
                {
                    if (!slt32(srem32(local_tid_35146 + chunk_offset_38935,
                                      sizze_31214), local_tid_35146 +
                               chunk_offset_38935 - (squot32(local_tid_35146,
                                                             32) * 32 - 1 +
                                                     chunk_offset_38935))) {
                        int32_t res_38931 = x_38929 + x_38930;
                        
                        x_38930 = res_38931;
                    }
                }
                // write final result
                {
                    *(volatile __local
                      int32_t *) &scan_arr_mem_38932[local_tid_35146 *
                                                     sizeof(int32_t)] = x_38930;
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // restore correct values for first block
        // Slots 0..31 were overwritten with per-block totals above; the
        // first block's own results are still held in x_38930.
        {
            if (squot32(local_tid_35146, 32) == 0) {
                *(volatile __local
                  int32_t *) &scan_arr_mem_38932[local_tid_35146 *
                                                 sizeof(int32_t)] = x_38930;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // threads in bounds write partial scan result
        {
            if (slt32(gtid_35122, sizze_31215) && slt32(gtid_35144,
                                                        sizze_31214)) {
                *(__global int32_t *) &mem_38270[(gtid_35122 * sizze_31214 +
                                                  gtid_35144) * 4] = *(__local
                                                                       int32_t *) &scan_arr_mem_38932[local_tid_35146 *
                                                                                                      4];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // first thread reads last element as carry-in for next iteration
        // The second slt32 argument reduces to 1, so the carry is reset to 0
        // exactly when the next chunk starts at column 0 of a row.
        {
            if (local_tid_35146 == 0) {
                if (slt32(srem32(chunk_offset_38935 + group_sizze_35152,
                                 sizze_31214), chunk_offset_38935 +
                          group_sizze_35152 - (chunk_offset_38935 +
                                               group_sizze_35152 - 1))) {
                    x_35169 = 0;
                } else {
                    x_35169 = *(__local
                                int32_t *) &scan_arr_mem_38932[(group_sizze_35152 -
                                                                1) * 4];
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
__kernel void scan_stage1_36464(int32_t sizze_31214, int32_t sizze_31215,
                                int32_t arg_31616, int32_t num_groups_36538,
                                __global unsigned char *res_mem_38290, __global
                                unsigned char *res_mem_38339, __global
                                unsigned char *res_mem_38340, __global
                                unsigned char *res_mem_38358, __global
                                unsigned char *mem_38437, __global
                                unsigned char *mem_38441)
{
    const int32_t group_sizze_36528 = mainzigroup_sizze_36446;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    ALIGNED_LOCAL_MEMORY(scan_arr_mem_39187_backing_0, 4 *
                         mainzigroup_sizze_36446);
    
    int32_t global_tid_36464;
    int32_t local_tid_36465;
    int32_t group_sizze_39183;
    int32_t wave_sizze_39182;
    int32_t group_id_36466;
    
    global_tid_36464 = get_global_id(0);
    local_tid_36465 = get_local_id(0);
    group_sizze_39183 = get_local_size(0);
    wave_sizze_39182 = LOCKSTEP_WIDTH;
    group_id_36466 = get_group_id(0);
    
    int32_t gtid_36442;
    int32_t gtid_36463;
    __local char *scan_arr_mem_39187;
    
    scan_arr_mem_39187 = (__local char *) scan_arr_mem_39187_backing_0;
    
    float x_36544;
    float x_36545;
    
    x_36544 = 0.0F;
    for (int32_t j_39189 = 0; j_39189 < squot32(sizze_31215 * arg_31616 +
                                                group_sizze_36528 *
                                                num_groups_36538 - 1,
                                                group_sizze_36528 *
                                                num_groups_36538); j_39189++) {
        int32_t chunk_offset_39190 = group_sizze_36528 * j_39189 +
                group_id_36466 * (group_sizze_36528 * squot32(sizze_31215 *
                                                              arg_31616 +
                                                              group_sizze_36528 *
                                                              num_groups_36538 -
                                                              1,
                                                              group_sizze_36528 *
                                                              num_groups_36538));
        int32_t flat_idx_39191 = chunk_offset_39190 + local_tid_36465;
        
        gtid_36442 = squot32(flat_idx_39191, arg_31616);
        gtid_36463 = flat_idx_39191 - squot32(flat_idx_39191, arg_31616) *
            arg_31616;
        // threads in bounds read input; others get neutral element
        {
            if (slt32(gtid_36442, sizze_31215) && slt32(gtid_36463,
                                                        arg_31616)) {
                int32_t x_36547;
                int32_t x_36548;
                float x_36549;
                int32_t y_36551;
                bool cond_36554;
                float res_36555;
                
                x_36547 = *(__global int32_t *) &res_mem_38340[gtid_36442 * 4];
                x_36548 = *(__global int32_t *) &res_mem_38339[gtid_36442 * 4];
                x_36549 = *(__global float *) &res_mem_38358[gtid_36442 * 4];
                y_36551 = *(__global int32_t *) &mem_38437[gtid_36442 * 4];
                cond_36554 = sle32(y_36551, gtid_36463);
                if (cond_36554) {
                    res_36555 = 0.0F;
                } else {
                    bool cond_36556;
                    float res_36557;
                    
                    cond_36556 = gtid_36463 == 0;
                    if (cond_36556) {
                        res_36557 = x_36549;
                    } else {
                        int32_t x_36558;
                        int32_t i_36559;
                        float negate_arg_36560;
                        float x_36561;
                        int32_t i_36562;
                        float y_36563;
                        float res_36564;
                        
                        x_36558 = x_36547 - x_36548;
                        i_36559 = gtid_36463 + x_36558;
                        negate_arg_36560 = *(__global
                                             float *) &res_mem_38290[(gtid_36442 *
                                                                      sizze_31214 +
                                                                      i_36559) *
                                                                     4];
                        x_36561 = 0.0F - negate_arg_36560;
                        i_36562 = gtid_36463 + x_36547;
                        y_36563 = *(__global
                                    float *) &res_mem_38290[(gtid_36442 *
                                                             sizze_31214 +
                                                             i_36562) * 4];
                        res_36564 = x_36561 + y_36563;
                        res_36557 = res_36564;
                    }
                    res_36555 = res_36557;
                }
                // write to-scan values to parameters
                {
                    x_36545 = res_36555;
                }
                // write mapped values results to global memory
                { }
            } else {
                x_36545 = 0.0F;
            }
        }
        // combine with carry and write to local memory
        {
            float res_36546 = x_36544 + x_36545;
            
            *(__local float *) &scan_arr_mem_39187[local_tid_36465 * 4] =
                res_36546;
        }
        
        float x_39184;
        float x_39185;
        float x_39192;
        float x_39193;
        int32_t skip_threads_39195;
        
        if (slt32(local_tid_36465, group_sizze_36528)) {
            x_39185 = *(volatile __local
                        float *) &scan_arr_mem_39187[local_tid_36465 *
                                                     sizeof(float)];
        }
        // in-block scan (hopefully no barriers needed)
        {
            skip_threads_39195 = 1;
            while (slt32(skip_threads_39195, 32)) {
                if (sle32(skip_threads_39195, local_tid_36465 -
                          squot32(local_tid_36465, 32) * 32) &&
                    slt32(local_tid_36465, group_sizze_36528)) {
                    // read operands
                    {
                        x_39184 = *(volatile __local
                                    float *) &scan_arr_mem_39187[(local_tid_36465 -
                                                                  skip_threads_39195) *
                                                                 sizeof(float)];
                    }
                    // perform operation
                    {
                        if (!slt32(srem32(local_tid_36465 + chunk_offset_39190,
                                          arg_31616), local_tid_36465 +
                                   chunk_offset_39190 - (local_tid_36465 -
                                                         skip_threads_39195 +
                                                         chunk_offset_39190))) {
                            float res_39186 = x_39184 + x_39185;
                            
                            x_39185 = res_39186;
                        }
                    }
                }
                if (sle32(wave_sizze_39182, skip_threads_39195)) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
                if (sle32(skip_threads_39195, local_tid_36465 -
                          squot32(local_tid_36465, 32) * 32) &&
                    slt32(local_tid_36465, group_sizze_36528)) {
                    // write result
                    {
                        *(volatile __local
                          float *) &scan_arr_mem_39187[local_tid_36465 *
                                                       sizeof(float)] = x_39185;
                    }
                }
                if (sle32(wave_sizze_39182, skip_threads_39195)) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
                skip_threads_39195 *= 2;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // last thread of block 'i' writes its result to offset 'i'
        {
            if ((local_tid_36465 - squot32(local_tid_36465, 32) * 32) == 31 &&
                slt32(local_tid_36465, group_sizze_36528)) {
                *(volatile __local
                  float *) &scan_arr_mem_39187[squot32(local_tid_36465, 32) *
                                               sizeof(float)] = x_39185;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
        {
            int32_t skip_threads_39196;
            
            if (squot32(local_tid_36465, 32) == 0 && slt32(local_tid_36465,
                                                           group_sizze_36528)) {
                x_39193 = *(volatile __local
                            float *) &scan_arr_mem_39187[local_tid_36465 *
                                                         sizeof(float)];
            }
            // in-block scan (hopefully no barriers needed)
            {
                skip_threads_39196 = 1;
                while (slt32(skip_threads_39196, 32)) {
                    if (sle32(skip_threads_39196, local_tid_36465 -
                              squot32(local_tid_36465, 32) * 32) &&
                        (squot32(local_tid_36465, 32) == 0 &&
                         slt32(local_tid_36465, group_sizze_36528))) {
                        // read operands
                        {
                            x_39192 = *(volatile __local
                                        float *) &scan_arr_mem_39187[(local_tid_36465 -
                                                                      skip_threads_39196) *
                                                                     sizeof(float)];
                        }
                        // perform operation
                        {
                            if (!slt32(srem32(local_tid_36465 * 32 + 32 - 1 +
                                              chunk_offset_39190, arg_31616),
                                       local_tid_36465 * 32 + 32 - 1 +
                                       chunk_offset_39190 - ((local_tid_36465 -
                                                              skip_threads_39196) *
                                                             32 + 32 - 1 +
                                                             chunk_offset_39190))) {
                                float res_39194 = x_39192 + x_39193;
                                
                                x_39193 = res_39194;
                            }
                        }
                    }
                    if (sle32(wave_sizze_39182, skip_threads_39196)) {
                        barrier(CLK_LOCAL_MEM_FENCE);
                    }
                    if (sle32(skip_threads_39196, local_tid_36465 -
                              squot32(local_tid_36465, 32) * 32) &&
                        (squot32(local_tid_36465, 32) == 0 &&
                         slt32(local_tid_36465, group_sizze_36528))) {
                        // write result
                        {
                            *(volatile __local
                              float *) &scan_arr_mem_39187[local_tid_36465 *
                                                           sizeof(float)] =
                                x_39193;
                        }
                    }
                    if (sle32(wave_sizze_39182, skip_threads_39196)) {
                        barrier(CLK_LOCAL_MEM_FENCE);
                    }
                    skip_threads_39196 *= 2;
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // carry-in for every block except the first
        {
            if (!(squot32(local_tid_36465, 32) == 0 || !slt32(local_tid_36465,
                                                              group_sizze_36528))) {
                // read operands
                {
                    x_39184 = *(volatile __local
                                float *) &scan_arr_mem_39187[(squot32(local_tid_36465,
                                                                      32) - 1) *
                                                             sizeof(float)];
                }
                // perform operation
                {
                    if (!slt32(srem32(local_tid_36465 + chunk_offset_39190,
                                      arg_31616), local_tid_36465 +
                               chunk_offset_39190 - (squot32(local_tid_36465,
                                                             32) * 32 - 1 +
                                                     chunk_offset_39190))) {
                        float res_39186 = x_39184 + x_39185;
                        
                        x_39185 = res_39186;
                    }
                }
                // write final result
                {
                    *(volatile __local
                      float *) &scan_arr_mem_39187[local_tid_36465 *
                                                   sizeof(float)] = x_39185;
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // restore correct values for first block
        {
            if (squot32(local_tid_36465, 32) == 0) {
                *(volatile __local
                  float *) &scan_arr_mem_39187[local_tid_36465 *
                                               sizeof(float)] = x_39185;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // threads in bounds write partial scan result
        {
            if (slt32(gtid_36442, sizze_31215) && slt32(gtid_36463,
                                                        arg_31616)) {
                *(__global float *) &mem_38441[(gtid_36442 * arg_31616 +
                                                gtid_36463) * 4] = *(__local
                                                                     float *) &scan_arr_mem_39187[local_tid_36465 *
                                                                                                  4];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // first thread reads last element as carry-in for next iteration
        {
            if (local_tid_36465 == 0) {
                if (slt32(srem32(chunk_offset_39190 + group_sizze_36528,
                                 arg_31616), chunk_offset_39190 +
                          group_sizze_36528 - (chunk_offset_39190 +
                                               group_sizze_36528 - 1))) {
                    x_36544 = 0.0F;
                } else {
                    x_36544 = *(__local
                                float *) &scan_arr_mem_39187[(group_sizze_36528 -
                                                              1) * 4];
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
// scan_stage2_38948: work-group-local int32 addition scan over one value per
// work-item, using local memory as scratch.
//
// Every work-item selects the int32 element of mem_38270 at flat index
// (local_tid + 1) * step - 1 (step is computed below), the selected values
// are scanned across the work-group in three phases (scan inside blocks of
// 32 work-items, scan of the per-block totals, carry-in from the previous
// block), and in-bounds work-items store their scanned value back to the
// same element of mem_38270.
//
// Parameters:
//   scan_arr_mem_38953_backing_aligned_0
//       Local scratch buffer. It is accessed as int32 slots at byte offset
//       local_tid * 4, and every work-item writes its own slot, so it has to
//       provide at least 4 bytes per work-item of the group.
//   sizze_31214
//       Divisor used to split the flat index into (gtid_35122, gtid_35144)
//       and upper bound for gtid_35144.
//   sizze_31215
//       Upper bound for gtid_35122.
//   num_groups_35162
//       Only work-items with local_tid < num_groups_35162 take part in the
//       scan phases.
//   mem_38270
//       Global byte buffer, read and written as int32 at byte offset
//       (gtid_35122 * sizze_31214 + gtid_35144) * 4.
//
// NOTE(review): squot32, srem32, slt32 and sle32 as well as
// mainzigroup_sizze_35127 and LOCKSTEP_WIDTH are defined outside this view.
// The comments below assume signed 32-bit quotient, remainder, less-than and
// less-or-equal respectively - confirm against their definitions.
__kernel void scan_stage2_38948(__local volatile
                                int64_t *scan_arr_mem_38953_backing_aligned_0,
                                int32_t sizze_31214, int32_t sizze_31215,
                                int32_t num_groups_35162, __global
                                unsigned char *mem_38270)
{
    // Group size is taken from a name defined outside this kernel, not from
    // get_local_size.
    const int32_t group_sizze_35152 = mainzigroup_sizze_35127;
    // block_dim0..2 are not referenced again inside this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    // Byte-addressed view of the int64_t-typed local scratch argument.
    __local volatile char *restrict scan_arr_mem_38953_backing_0 =
                          scan_arr_mem_38953_backing_aligned_0;
    int32_t global_tid_38948;
    int32_t local_tid_38949;
    int32_t group_sizze_38952;
    int32_t wave_sizze_38951;
    int32_t group_id_38950;
    
    // Of these, only local_tid_38949 and wave_sizze_38951 are read later in
    // this kernel; the other three are assigned and then unused.
    global_tid_38948 = get_global_id(0);
    local_tid_38949 = get_local_id(0);
    group_sizze_38952 = get_local_size(0);
    wave_sizze_38951 = LOCKSTEP_WIDTH;
    group_id_38950 = get_group_id(0);
    
    __local char *scan_arr_mem_38953;
    
    // The cast drops the volatile qualifier; the scan phases below re-add it
    // on each access through explicit volatile pointer casts.
    scan_arr_mem_38953 = (__local char *) scan_arr_mem_38953_backing_0;
    
    // flat_idx = (local_tid + 1) * step - 1, where
    //   step = group_sizze_35152 * squot32(a + b - 1, b),
    //   a    = sizze_31215 * sizze_31214,
    //   b    = group_sizze_35152 * num_groups_35162.
    // With squot32 as plain signed division and positive operands,
    // squot32(a + b - 1, b) is a divided by b rounded up (assumption, see the
    // note at the top). The same step expression is spelled out again in
    // every guard of the scan phases below.
    int32_t flat_idx_38955 = (local_tid_38949 + 1) * (group_sizze_35152 *
                                                      squot32(sizze_31215 *
                                                              sizze_31214 +
                                                              group_sizze_35152 *
                                                              num_groups_35162 -
                                                              1,
                                                              group_sizze_35152 *
                                                              num_groups_35162)) -
            1;
    // Split the flat index: gtid_35122 is the quotient by sizze_31214 and
    // gtid_35144 the matching remainder (flat - quotient * sizze_31214).
    int32_t gtid_35122 = squot32(flat_idx_38955, sizze_31214);
    int32_t gtid_35144;
    
    gtid_35144 = flat_idx_38955 - squot32(flat_idx_38955, sizze_31214) *
        sizze_31214;
    // threads in bound read carries; others get neutral element
    // (the neutral element written here is integer 0)
    {
        if (slt32(gtid_35122, sizze_31215) && slt32(gtid_35144, sizze_31214)) {
            *(__local int32_t *) &scan_arr_mem_38953[local_tid_38949 * 4] =
                *(__global int32_t *) &mem_38270[(gtid_35122 * sizze_31214 +
                                                  gtid_35144) * 4];
        } else {
            *(__local int32_t *) &scan_arr_mem_38953[local_tid_38949 * 4] = 0;
        }
    }
    
    // x_38942 / x_38943: partner operand and running value of this work-item
    // in phases 1 and 3. x_38956 / x_38957: the same roles in phase 2.
    // None of them is initialised at declaration.
    int32_t x_38942;
    int32_t x_38943;
    int32_t x_38956;
    int32_t x_38957;
    int32_t skip_threads_38959;
    
    // Phase 1: scan inside each block of 32 consecutive work-items.
    // Participating work-items load their own slot as the running value.
    // NOTE(review): there is no barrier between the slot initialisation above
    // and this load; each work-item reads back the slot it wrote itself.
    if (slt32(local_tid_38949, num_groups_35162)) {
        x_38943 = *(volatile __local
                    int32_t *) &scan_arr_mem_38953[local_tid_38949 *
                                                   sizeof(int32_t)];
    }
    // in-block scan (hopefully no barriers needed)
    {
        // The stride doubles each iteration: 1, 2, 4, 8, 16.
        skip_threads_38959 = 1;
        while (slt32(skip_threads_38959, 32)) {
            // Active when the position inside the 32-wide block
            // (local_tid - squot32(local_tid, 32) * 32) is at least the
            // stride and the work-item takes part in the scan.
            if (sle32(skip_threads_38959, local_tid_38949 -
                      squot32(local_tid_38949, 32) * 32) &&
                slt32(local_tid_38949, num_groups_35162)) {
                // read operands
                // (the slot of the work-item that is stride positions lower)
                {
                    x_38942 = *(volatile __local
                                int32_t *) &scan_arr_mem_38953[(local_tid_38949 -
                                                                skip_threads_38959) *
                                                               sizeof(int32_t)];
                }
                // perform operation
                // Guard: combine only when NOT
                //   srem32(own_flat, sizze_31214) < own_flat - partner_flat
                // with own_flat = (local_tid + 1) * step - 1 and
                // partner_flat = (local_tid - stride + 1) * step - 1.
                // NOTE(review): this looks like a segment-boundary test that
                // stops sums from crossing rows of length sizze_31214 -
                // confirm against the generator.
                {
                    if (!slt32(srem32((local_tid_38949 + 1) *
                                      (group_sizze_35152 * squot32(sizze_31215 *
                                                                   sizze_31214 +
                                                                   group_sizze_35152 *
                                                                   num_groups_35162 -
                                                                   1,
                                                                   group_sizze_35152 *
                                                                   num_groups_35162)) -
                                      1, sizze_31214), (local_tid_38949 + 1) *
                               (group_sizze_35152 * squot32(sizze_31215 *
                                                            sizze_31214 +
                                                            group_sizze_35152 *
                                                            num_groups_35162 -
                                                            1,
                                                            group_sizze_35152 *
                                                            num_groups_35162)) -
                               1 - ((local_tid_38949 - skip_threads_38959 + 1) *
                                    (group_sizze_35152 * squot32(sizze_31215 *
                                                                 sizze_31214 +
                                                                 group_sizze_35152 *
                                                                 num_groups_35162 -
                                                                 1,
                                                                 group_sizze_35152 *
                                                                 num_groups_35162)) -
                                    1))) {
                        int32_t res_38944 = x_38942 + x_38943;
                        
                        x_38943 = res_38944;
                    }
                }
            }
            // A barrier separates the reads from the writes only once the
            // stride has reached LOCKSTEP_WIDTH; below that no barrier is
            // issued (see the hopefully-no-barriers remark above).
            if (sle32(wave_sizze_38951, skip_threads_38959)) {
                barrier(CLK_LOCAL_MEM_FENCE);
            }
            // Same activity condition as for the read above.
            if (sle32(skip_threads_38959, local_tid_38949 -
                      squot32(local_tid_38949, 32) * 32) &&
                slt32(local_tid_38949, num_groups_35162)) {
                // write result
                {
                    *(volatile __local
                      int32_t *) &scan_arr_mem_38953[local_tid_38949 *
                                                     sizeof(int32_t)] = x_38943;
                }
            }
            // Second conditional barrier: separates this write from the
            // reads of the next iteration, under the same stride condition.
            if (sle32(wave_sizze_38951, skip_threads_38959)) {
                barrier(CLK_LOCAL_MEM_FENCE);
            }
            skip_threads_38959 *= 2;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // last thread of block 'i' writes its result to offset 'i'
    // (this overwrites slots that belong to work-items of the first block;
    // those slots are rewritten from x_38943 in the restore step further down)
    {
        if ((local_tid_38949 - squot32(local_tid_38949, 32) * 32) == 31 &&
            slt32(local_tid_38949, num_groups_35162)) {
            *(volatile __local
              int32_t *) &scan_arr_mem_38953[squot32(local_tid_38949, 32) *
                                             sizeof(int32_t)] = x_38943;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
    // Phase 2: only work-items of the first block (squot32(local_tid, 32) == 0)
    // that also satisfy local_tid < num_groups_35162 are active here.
    {
        int32_t skip_threads_38960;
        
        if (squot32(local_tid_38949, 32) == 0 && slt32(local_tid_38949,
                                                       num_groups_35162)) {
            x_38957 = *(volatile __local
                        int32_t *) &scan_arr_mem_38953[local_tid_38949 *
                                                       sizeof(int32_t)];
        }
        // in-block scan (hopefully no barriers needed)
        {
            skip_threads_38960 = 1;
            while (slt32(skip_threads_38960, 32)) {
                if (sle32(skip_threads_38960, local_tid_38949 -
                          squot32(local_tid_38949, 32) * 32) &&
                    (squot32(local_tid_38949, 32) == 0 && slt32(local_tid_38949,
                                                                num_groups_35162))) {
                    // read operands
                    {
                        x_38956 = *(volatile __local
                                    int32_t *) &scan_arr_mem_38953[(local_tid_38949 -
                                                                    skip_threads_38960) *
                                                                   sizeof(int32_t)];
                    }
                    // perform operation
                    // Same guard shape as in phase 1, but slot k stands for
                    // work-item k * 32 + 31 (the last one of block k), so the
                    // flat indices are (k * 32 + 32 - 1 + 1) * step - 1 for
                    // k = local_tid and for k = local_tid - stride.
                    {
                        if (!slt32(srem32((local_tid_38949 * 32 + 32 - 1 + 1) *
                                          (group_sizze_35152 *
                                           squot32(sizze_31215 * sizze_31214 +
                                                   group_sizze_35152 *
                                                   num_groups_35162 - 1,
                                                   group_sizze_35152 *
                                                   num_groups_35162)) - 1,
                                          sizze_31214), (local_tid_38949 * 32 +
                                                         32 - 1 + 1) *
                                   (group_sizze_35152 * squot32(sizze_31215 *
                                                                sizze_31214 +
                                                                group_sizze_35152 *
                                                                num_groups_35162 -
                                                                1,
                                                                group_sizze_35152 *
                                                                num_groups_35162)) -
                                   1 - (((local_tid_38949 -
                                          skip_threads_38960) * 32 + 32 - 1 +
                                         1) * (group_sizze_35152 *
                                               squot32(sizze_31215 *
                                                       sizze_31214 +
                                                       group_sizze_35152 *
                                                       num_groups_35162 - 1,
                                                       group_sizze_35152 *
                                                       num_groups_35162)) -
                                        1))) {
                            int32_t res_38958 = x_38956 + x_38957;
                            
                            x_38957 = res_38958;
                        }
                    }
                }
                // Conditional barriers as in phase 1. They sit outside the
                // activity test, so every work-item of the group reaches them.
                if (sle32(wave_sizze_38951, skip_threads_38960)) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
                if (sle32(skip_threads_38960, local_tid_38949 -
                          squot32(local_tid_38949, 32) * 32) &&
                    (squot32(local_tid_38949, 32) == 0 && slt32(local_tid_38949,
                                                                num_groups_35162))) {
                    // write result
                    {
                        *(volatile __local
                          int32_t *) &scan_arr_mem_38953[local_tid_38949 *
                                                         sizeof(int32_t)] =
                            x_38957;
                    }
                }
                if (sle32(wave_sizze_38951, skip_threads_38960)) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
                skip_threads_38960 *= 2;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // carry-in for every block except the first
    // Phase 3: the condition below is equivalent to
    // squot32(local_tid, 32) != 0 && local_tid < num_groups_35162.
    {
        if (!(squot32(local_tid_38949, 32) == 0 || !slt32(local_tid_38949,
                                                          num_groups_35162))) {
            // read operands
            // (slot block - 1 holds the scanned total of all previous blocks)
            {
                x_38942 = *(volatile __local
                            int32_t *) &scan_arr_mem_38953[(squot32(local_tid_38949,
                                                                    32) - 1) *
                                                           sizeof(int32_t)];
            }
            // perform operation
            // Same guard shape as before; the partner flat index is the one
            // of work-item squot32(local_tid, 32) * 32 - 1, i.e. the last
            // work-item of the previous block.
            {
                if (!slt32(srem32((local_tid_38949 + 1) * (group_sizze_35152 *
                                                           squot32(sizze_31215 *
                                                                   sizze_31214 +
                                                                   group_sizze_35152 *
                                                                   num_groups_35162 -
                                                                   1,
                                                                   group_sizze_35152 *
                                                                   num_groups_35162)) -
                                  1, sizze_31214), (local_tid_38949 + 1) *
                           (group_sizze_35152 * squot32(sizze_31215 *
                                                        sizze_31214 +
                                                        group_sizze_35152 *
                                                        num_groups_35162 - 1,
                                                        group_sizze_35152 *
                                                        num_groups_35162)) - 1 -
                           ((squot32(local_tid_38949, 32) * 32 - 1 + 1) *
                            (group_sizze_35152 * squot32(sizze_31215 *
                                                         sizze_31214 +
                                                         group_sizze_35152 *
                                                         num_groups_35162 - 1,
                                                         group_sizze_35152 *
                                                         num_groups_35162)) -
                            1))) {
                    int32_t res_38944 = x_38942 + x_38943;
                    
                    x_38943 = res_38944;
                }
            }
            // write final result
            {
                *(volatile __local
                  int32_t *) &scan_arr_mem_38953[local_tid_38949 *
                                                 sizeof(int32_t)] = x_38943;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // restore correct values for first block
    // The first-block slots were reused for the block totals, so each
    // first-block work-item puts its own phase-1 value back.
    // NOTE(review): this write is not guarded by local_tid < num_groups_35162,
    // so a first-block work-item outside the scan stores an x_38943 that was
    // never assigned. Such work-items presumably fail the bounds test of the
    // final write, so the value should not reach global memory - confirm.
    {
        if (squot32(local_tid_38949, 32) == 0) {
            *(volatile __local int32_t *) &scan_arr_mem_38953[local_tid_38949 *
                                                              sizeof(int32_t)] =
                x_38943;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // threads in bounds write scanned carries
    // (same bounds test and same global byte offset as the initial read)
    {
        if (slt32(gtid_35122, sizze_31215) && slt32(gtid_35144, sizze_31214)) {
            *(__global int32_t *) &mem_38270[(gtid_35122 * sizze_31214 +
                                              gtid_35144) * 4] = *(__local
                                                                   int32_t *) &scan_arr_mem_38953[local_tid_38949 *
                                                                                                  4];
        }
    }
}
__kernel void scan_stage2_39203(__local volatile
                                int64_t *scan_arr_mem_39208_backing_aligned_0,
                                int32_t sizze_31215, int32_t arg_31616,
                                int32_t num_groups_36538, __global
                                unsigned char *mem_38441)
{
    const int32_t group_sizze_36528 = mainzigroup_sizze_36446;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    __local volatile char *restrict scan_arr_mem_39208_backing_0 =
                          scan_arr_mem_39208_backing_aligned_0;
    int32_t global_tid_39203;
    int32_t local_tid_39204;
    int32_t group_sizze_39207;
    int32_t wave_sizze_39206;
    int32_t group_id_39205;
    
    global_tid_39203 = get_global_id(0);
    local_tid_39204 = get_local_id(0);
    group_sizze_39207 = get_local_size(0);
    wave_sizze_39206 = LOCKSTEP_WIDTH;
    group_id_39205 = get_group_id(0);
    
    __local char *scan_arr_mem_39208;
    
    scan_arr_mem_39208 = (__local char *) scan_arr_mem_39208_backing_0;
    
    int32_t flat_idx_39210 = (local_tid_39204 + 1) * (group_sizze_36528 *
                                                      squot32(sizze_31215 *
                                                              arg_31616 +
                                                              group_sizze_36528 *
                                                              num_groups_36538 -
                                                              1,
                                                              group_sizze_36528 *
                                                              num_groups_36538)) -
            1;
    int32_t gtid_36442 = squot32(flat_idx_39210, arg_31616);
    int32_t gtid_36463;
    
    gtid_36463 = flat_idx_39210 - squot32(flat_idx_39210, arg_31616) *
        arg_31616;
    // threads in bound read carries; others get neutral element
    {
        if (slt32(gtid_36442, sizze_31215) && slt32(gtid_36463, arg_31616)) {
            *(__local float *) &scan_arr_mem_39208[local_tid_39204 * 4] =
                *(__global float *) &mem_38441[(gtid_36442 * arg_31616 +
                                                gtid_36463) * 4];
        } else {
            *(__local float *) &scan_arr_mem_39208[local_tid_39204 * 4] = 0.0F;
        }
    }
    
    float x_39197;
    float x_39198;
    float x_39211;
    float x_39212;
    int32_t skip_threads_39214;
    
    if (slt32(local_tid_39204, num_groups_36538)) {
        x_39198 = *(volatile __local
                    float *) &scan_arr_mem_39208[local_tid_39204 *
                                                 sizeof(float)];
    }
    // in-block scan (hopefully no barriers needed)
    {
        skip_threads_39214 = 1;
        while (slt32(skip_threads_39214, 32)) {
            if (sle32(skip_threads_39214, local_tid_39204 -
                      squot32(local_tid_39204, 32) * 32) &&
                slt32(local_tid_39204, num_groups_36538)) {
                // read operands
                {
                    x_39197 = *(volatile __local
                                float *) &scan_arr_mem_39208[(local_tid_39204 -
                                                              skip_threads_39214) *
                                                             sizeof(float)];
                }
                // perform operation
                {
                    if (!slt32(srem32((local_tid_39204 + 1) *
                                      (group_sizze_36528 * squot32(sizze_31215 *
                                                                   arg_31616 +
                                                                   group_sizze_36528 *
                                                                   num_groups_36538 -
                                                                   1,
                                                                   group_sizze_36528 *
                                                                   num_groups_36538)) -
                                      1, arg_31616), (local_tid_39204 + 1) *
                               (group_sizze_36528 * squot32(sizze_31215 *
                                                            arg_31616 +
                                                            group_sizze_36528 *
                                                            num_groups_36538 -
                                                            1,
                                                            group_sizze_36528 *
                                                            num_groups_36538)) -
                               1 - ((local_tid_39204 - skip_threads_39214 + 1) *
                                    (group_sizze_36528 * squot32(sizze_31215 *
                                                                 arg_31616 +
                                                                 group_sizze_36528 *
                                                                 num_groups_36538 -
                                                                 1,
                                                                 group_sizze_36528 *
                                                                 num_groups_36538)) -
                                    1))) {
                        float res_39199 = x_39197 + x_39198;
                        
                        x_39198 = res_39199;
                    }
                }
            }
            if (sle32(wave_sizze_39206, skip_threads_39214)) {
                barrier(CLK_LOCAL_MEM_FENCE);
            }
            if (sle32(skip_threads_39214, local_tid_39204 -
                      squot32(local_tid_39204, 32) * 32) &&
                slt32(local_tid_39204, num_groups_36538)) {
                // write result
                {
                    *(volatile __local
                      float *) &scan_arr_mem_39208[local_tid_39204 *
                                                   sizeof(float)] = x_39198;
                }
            }
            if (sle32(wave_sizze_39206, skip_threads_39214)) {
                barrier(CLK_LOCAL_MEM_FENCE);
            }
            skip_threads_39214 *= 2;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // last thread of block 'i' writes its result to offset 'i'
    {
        if ((local_tid_39204 - squot32(local_tid_39204, 32) * 32) == 31 &&
            slt32(local_tid_39204, num_groups_36538)) {
            *(volatile __local
              float *) &scan_arr_mem_39208[squot32(local_tid_39204, 32) *
                                           sizeof(float)] = x_39198;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
    {
        int32_t skip_threads_39215;
        
        if (squot32(local_tid_39204, 32) == 0 && slt32(local_tid_39204,
                                                       num_groups_36538)) {
            x_39212 = *(volatile __local
                        float *) &scan_arr_mem_39208[local_tid_39204 *
                                                     sizeof(float)];
        }
        // in-block scan (hopefully no barriers needed)
        {
            skip_threads_39215 = 1;
            while (slt32(skip_threads_39215, 32)) {
                if (sle32(skip_threads_39215, local_tid_39204 -
                          squot32(local_tid_39204, 32) * 32) &&
                    (squot32(local_tid_39204, 32) == 0 && slt32(local_tid_39204,
                                                                num_groups_36538))) {
                    // read operands
                    {
                        x_39211 = *(volatile __local
                                    float *) &scan_arr_mem_39208[(local_tid_39204 -
                                                                  skip_threads_39215) *
                                                                 sizeof(float)];
                    }
                    // perform operation
                    {
                        if (!slt32(srem32((local_tid_39204 * 32 + 32 - 1 + 1) *
                                          (group_sizze_36528 *
                                           squot32(sizze_31215 * arg_31616 +
                                                   group_sizze_36528 *
                                                   num_groups_36538 - 1,
                                                   group_sizze_36528 *
                                                   num_groups_36538)) - 1,
                                          arg_31616), (local_tid_39204 * 32 +
                                                       32 - 1 + 1) *
                                   (group_sizze_36528 * squot32(sizze_31215 *
                                                                arg_31616 +
                                                                group_sizze_36528 *
                                                                num_groups_36538 -
                                                                1,
                                                                group_sizze_36528 *
                                                                num_groups_36538)) -
                                   1 - (((local_tid_39204 -
                                          skip_threads_39215) * 32 + 32 - 1 +
                                         1) * (group_sizze_36528 *
                                               squot32(sizze_31215 * arg_31616 +
                                                       group_sizze_36528 *
                                                       num_groups_36538 - 1,
                                                       group_sizze_36528 *
                                                       num_groups_36538)) -
                                        1))) {
                            float res_39213 = x_39211 + x_39212;
                            
                            x_39212 = res_39213;
                        }
                    }
                }
                if (sle32(wave_sizze_39206, skip_threads_39215)) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
                if (sle32(skip_threads_39215, local_tid_39204 -
                          squot32(local_tid_39204, 32) * 32) &&
                    (squot32(local_tid_39204, 32) == 0 && slt32(local_tid_39204,
                                                                num_groups_36538))) {
                    // write result
                    {
                        *(volatile __local
                          float *) &scan_arr_mem_39208[local_tid_39204 *
                                                       sizeof(float)] = x_39212;
                    }
                }
                if (sle32(wave_sizze_39206, skip_threads_39215)) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                }
                skip_threads_39215 *= 2;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // carry-in for every block except the first
    {
        if (!(squot32(local_tid_39204, 32) == 0 || !slt32(local_tid_39204,
                                                          num_groups_36538))) {
            // read operands
            {
                x_39197 = *(volatile __local
                            float *) &scan_arr_mem_39208[(squot32(local_tid_39204,
                                                                  32) - 1) *
                                                         sizeof(float)];
            }
            // perform operation
            {
                if (!slt32(srem32((local_tid_39204 + 1) * (group_sizze_36528 *
                                                           squot32(sizze_31215 *
                                                                   arg_31616 +
                                                                   group_sizze_36528 *
                                                                   num_groups_36538 -
                                                                   1,
                                                                   group_sizze_36528 *
                                                                   num_groups_36538)) -
                                  1, arg_31616), (local_tid_39204 + 1) *
                           (group_sizze_36528 * squot32(sizze_31215 *
                                                        arg_31616 +
                                                        group_sizze_36528 *
                                                        num_groups_36538 - 1,
                                                        group_sizze_36528 *
                                                        num_groups_36538)) - 1 -
                           ((squot32(local_tid_39204, 32) * 32 - 1 + 1) *
                            (group_sizze_36528 * squot32(sizze_31215 *
                                                         arg_31616 +
                                                         group_sizze_36528 *
                                                         num_groups_36538 - 1,
                                                         group_sizze_36528 *
                                                         num_groups_36538)) -
                            1))) {
                    float res_39199 = x_39197 + x_39198;
                    
                    x_39198 = res_39199;
                }
            }
            // write final result
            {
                *(volatile __local
                  float *) &scan_arr_mem_39208[local_tid_39204 *
                                               sizeof(float)] = x_39198;
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // restore correct values for first block
    {
        if (squot32(local_tid_39204, 32) == 0) {
            *(volatile __local float *) &scan_arr_mem_39208[local_tid_39204 *
                                                            sizeof(float)] =
                x_39198;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // threads in bounds write scanned carries
    {
        if (slt32(gtid_36442, sizze_31215) && slt32(gtid_36463, arg_31616)) {
            *(__global float *) &mem_38441[(gtid_36442 * arg_31616 +
                                            gtid_36463) * 4] = *(__local
                                                                 float *) &scan_arr_mem_39208[local_tid_39204 *
                                                                                              4];
        }
    }
}
// Third stage of a scan over the int32_t elements stored in mem_38270.
//
// Each work-item owns the element at flat index get_global_id(0).  When all
// of the checks below pass, it reads the element at the carry-in index (the
// flat index just before the start of the chunk this element falls in), adds
// it to its own element and writes the sum back in place.
//
// NOTE(review): slt32 / squot32 / srem32 are helpers defined elsewhere in the
// program; they are presumably signed less-than, quotient and remainder.
// Confirm against their definitions.
//
// Parameters:
//   sizze_31214       row length used to split a flat index into (row, col);
//                     also one factor of the total element count.
//   sizze_31215       the other factor of the total element count.
//   num_groups_35162  combined with the group size to derive the chunk size.
//   mem_38270         byte buffer, addressed with 4 bytes per element; read
//                     and updated in place.
__kernel void scan_stage3_38961(int32_t sizze_31214, int32_t sizze_31215,
                                int32_t num_groups_35162, __global
                                unsigned char *mem_38270)
{
    const int32_t group_sizze_35152 = mainzigroup_sizze_35127;
    const int32_t flat_idx = get_global_id(0);
    const int32_t total_elems = sizze_31215 * sizze_31214;

    // Split the flat index into (row, col) using sizze_31214 as row length.
    const int32_t row = squot32(flat_idx, sizze_31214);
    const int32_t col = flat_idx - row * sizze_31214;

    // Elements per chunk: the group size times the quotient
    // (total_elems + num_threads - 1) / num_threads.
    const int32_t num_threads = group_sizze_35152 * num_groups_35162;
    const int32_t chunk_elems = group_sizze_35152 *
        squot32(total_elems + num_threads - 1, num_threads);

    // Chunk this element falls in, and the flat index just before that chunk.
    const int32_t chunk_id = squot32(flat_idx, chunk_elems);
    const int32_t carry_idx = chunk_id * chunk_elems - 1;

    // Work-items past the end of the data have nothing to update.
    if (!slt32(flat_idx, total_elems))
        return;

    // Elements of the first chunk are left untouched.
    if (chunk_id == 0)
        return;

    // The last element of every chunk is left untouched.
    if (flat_idx == (chunk_id + 1) * chunk_elems - 1)
        return;

    // Skip when the position of this element within its row is smaller than
    // its distance from the carry-in index.
    // NOTE(review): presumably this means a new row starts between the
    // carry-in element and this one, so the carry belongs to another
    // segment.  Confirm against the scan being implemented.
    if (slt32(srem32(flat_idx, sizze_31214), flat_idx - carry_idx))
        return;

    // Address both operands by byte offset (4 bytes per element).
    const int32_t carry_row = squot32(carry_idx, sizze_31214);
    const int32_t carry_col = carry_idx - carry_row * sizze_31214;
    __global int32_t *carry_ptr =
        (__global int32_t *) &mem_38270[(carry_row * sizze_31214 +
                                         carry_col) * 4];
    __global int32_t *elem_ptr =
        (__global int32_t *) &mem_38270[(row * sizze_31214 + col) * 4];

    // Read the carry first, then the element itself, and store the sum over
    // the element.
    const int32_t carry_val = *carry_ptr;
    const int32_t own_val = *elem_ptr;

    *elem_ptr = carry_val + own_val;
}
// Third stage of a scan over the float elements stored in mem_38441.
//
// Each work-item owns the element at flat index get_global_id(0).  When all
// of the checks below pass, it reads the element at the carry-in index (the
// flat index just before the start of the chunk this element falls in), adds
// it to its own element and writes the sum back in place.
//
// NOTE(review): slt32 / squot32 / srem32 are helpers defined elsewhere in the
// program; they are presumably signed less-than, quotient and remainder.
// Confirm against their definitions.
//
// Parameters:
//   sizze_31215       one factor of the total element count.
//   arg_31616         row length used to split a flat index into (row, col);
//                     also the other factor of the total element count.
//   num_groups_36538  combined with the group size to derive the chunk size.
//   mem_38441         byte buffer, addressed with 4 bytes per element; read
//                     and updated in place.
__kernel void scan_stage3_39216(int32_t sizze_31215, int32_t arg_31616,
                                int32_t num_groups_36538, __global
                                unsigned char *mem_38441)
{
    const int32_t group_sizze_36528 = mainzigroup_sizze_36446;
    const int32_t flat_idx = get_global_id(0);
    const int32_t total_elems = sizze_31215 * arg_31616;

    // Split the flat index into (row, col) using arg_31616 as row length.
    const int32_t row = squot32(flat_idx, arg_31616);
    const int32_t col = flat_idx - row * arg_31616;

    // Elements per chunk: the group size times the quotient
    // (total_elems + num_threads - 1) / num_threads.
    const int32_t num_threads = group_sizze_36528 * num_groups_36538;
    const int32_t chunk_elems = group_sizze_36528 *
        squot32(total_elems + num_threads - 1, num_threads);

    // Chunk this element falls in, and the flat index just before that chunk.
    const int32_t chunk_id = squot32(flat_idx, chunk_elems);
    const int32_t carry_idx = chunk_id * chunk_elems - 1;

    // Work-items past the end of the data have nothing to update.
    if (!slt32(flat_idx, total_elems))
        return;

    // Elements of the first chunk are left untouched.
    if (chunk_id == 0)
        return;

    // The last element of every chunk is left untouched.
    if (flat_idx == (chunk_id + 1) * chunk_elems - 1)
        return;

    // Skip when the position of this element within its row is smaller than
    // its distance from the carry-in index.
    // NOTE(review): presumably this means a new row starts between the
    // carry-in element and this one, so the carry belongs to another
    // segment.  Confirm against the scan being implemented.
    if (slt32(srem32(flat_idx, arg_31616), flat_idx - carry_idx))
        return;

    // Address both operands by byte offset (4 bytes per element).
    const int32_t carry_row = squot32(carry_idx, arg_31616);
    const int32_t carry_col = carry_idx - carry_row * arg_31616;
    __global float *carry_ptr =
        (__global float *) &mem_38441[(carry_row * arg_31616 +
                                       carry_col) * 4];
    __global float *elem_ptr =
        (__global float *) &mem_38441[(row * arg_31616 + col) * 4];

    // Read the carry first, then the element itself, and store the sum
    // (carry + element, in that operand order) over the element.
    const float carry_val = *carry_ptr;
    const float own_val = *elem_ptr;

    *elem_ptr = carry_val + own_val;
}
__kernel void segred_large_32822(int32_t sizze_31214, int32_t sizze_31215,
                                 int32_t sizze_31216, int32_t n_31219,
                                 int32_t res_31237, int32_t num_groups_33119,
                                 __global unsigned char *images_mem_37894,
                                 __global unsigned char *arg_mem_37903, __global
                                 unsigned char *mem_37995, __global
                                 unsigned char *mem_38000,
                                 int32_t thread_per_segment_38667, __global
                                 unsigned char *group_res_arr_mem_38668,
                                 __global unsigned char *counter_mem_38670)
{
    const int32_t group_sizze_33109 = mainzigroup_sizze_32804;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    ALIGNED_LOCAL_MEMORY(red_arr_mem_38672_backing_0, 4 *
                         mainzigroup_sizze_32804);
    ALIGNED_LOCAL_MEMORY(sync_arr_mem_38674_backing_1, 1);
    
    int32_t global_tid_32822;
    int32_t local_tid_32823;
    int32_t group_sizze_38664;
    int32_t wave_sizze_38663;
    int32_t group_id_32824;
    
    global_tid_32822 = get_global_id(0);
    local_tid_32823 = get_local_id(0);
    group_sizze_38664 = get_local_size(0);
    wave_sizze_38663 = LOCKSTEP_WIDTH;
    group_id_32824 = get_group_id(0);
    
    int32_t gtid_32791;
    int32_t gtid_32792;
    int32_t gtid_32793;
    int32_t gtid_32821;
    __local char *red_arr_mem_38672;
    
    red_arr_mem_38672 = (__local char *) red_arr_mem_38672_backing_0;
    
    __local char *sync_arr_mem_38674;
    
    sync_arr_mem_38674 = (__local char *) sync_arr_mem_38674_backing_1;
    gtid_32791 = squot32(squot32(group_id_32824, squot32(num_groups_33119 +
                                                         smax32(1, sizze_31215 *
                                                                res_31237 *
                                                                res_31237) - 1,
                                                         smax32(1, sizze_31215 *
                                                                res_31237 *
                                                                res_31237))),
                         res_31237 * res_31237);
    gtid_32792 = squot32(squot32(group_id_32824, squot32(num_groups_33119 +
                                                         smax32(1, sizze_31215 *
                                                                res_31237 *
                                                                res_31237) - 1,
                                                         smax32(1, sizze_31215 *
                                                                res_31237 *
                                                                res_31237))) -
                         squot32(squot32(group_id_32824,
                                         squot32(num_groups_33119 + smax32(1,
                                                                           sizze_31215 *
                                                                           res_31237 *
                                                                           res_31237) -
                                                 1, smax32(1, sizze_31215 *
                                                           res_31237 *
                                                           res_31237))),
                                 res_31237 * res_31237) * (res_31237 *
                                                           res_31237),
                         res_31237);
    gtid_32793 = squot32(group_id_32824, squot32(num_groups_33119 + smax32(1,
                                                                           sizze_31215 *
                                                                           res_31237 *
                                                                           res_31237) -
                                                 1, smax32(1, sizze_31215 *
                                                           res_31237 *
                                                           res_31237))) -
        squot32(squot32(group_id_32824, squot32(num_groups_33119 + smax32(1,
                                                                          sizze_31215 *
                                                                          res_31237 *
                                                                          res_31237) -
                                                1, smax32(1, sizze_31215 *
                                                          res_31237 *
                                                          res_31237))),
                res_31237 * res_31237) * (res_31237 * res_31237) -
        squot32(squot32(group_id_32824, squot32(num_groups_33119 + smax32(1,
                                                                          sizze_31215 *
                                                                          res_31237 *
                                                                          res_31237) -
                                                1, smax32(1, sizze_31215 *
                                                          res_31237 *
                                                          res_31237))) -
                squot32(squot32(group_id_32824, squot32(num_groups_33119 +
                                                        smax32(1, sizze_31215 *
                                                               res_31237 *
                                                               res_31237) - 1,
                                                        smax32(1, sizze_31215 *
                                                               res_31237 *
                                                               res_31237))),
                        res_31237 * res_31237) * (res_31237 * res_31237),
                res_31237) * res_31237;
    
    int32_t chunk_sizze_38676 = smin32(squot32(n_31219 + group_sizze_33109 *
                                               squot32(num_groups_33119 +
                                                       smax32(1, sizze_31215 *
                                                              res_31237 *
                                                              res_31237) - 1,
                                                       smax32(1, sizze_31215 *
                                                              res_31237 *
                                                              res_31237)) - 1,
                                               group_sizze_33109 *
                                               squot32(num_groups_33119 +
                                                       smax32(1, sizze_31215 *
                                                              res_31237 *
                                                              res_31237) - 1,
                                                       smax32(1, sizze_31215 *
                                                              res_31237 *
                                                              res_31237))),
                                       squot32(n_31219 -
                                               srem32(global_tid_32822,
                                                      group_sizze_33109 *
                                                      squot32(num_groups_33119 +
                                                              smax32(1,
                                                                     sizze_31215 *
                                                                     res_31237 *
                                                                     res_31237) -
                                                              1, smax32(1,
                                                                        sizze_31215 *
                                                                        res_31237 *
                                                                        res_31237))) +
                                               thread_per_segment_38667 - 1,
                                               thread_per_segment_38667));
    float x_33125;
    float x_33126;
    
    x_33125 = 0.0F;
    for (int32_t i_38680 = 0; i_38680 < chunk_sizze_38676; i_38680++) {
        gtid_32821 = srem32(global_tid_32822, group_sizze_33109 *
                            squot32(num_groups_33119 + smax32(1, sizze_31215 *
                                                              res_31237 *
                                                              res_31237) - 1,
                                    smax32(1, sizze_31215 * res_31237 *
                                           res_31237))) +
            thread_per_segment_38667 * i_38680;
        // apply map function
        {
            float x_33131;
            float x_33132;
            float x_33133;
            float x_33134;
            bool res_33135;
            float y_33136;
            float res_33137;
            
            x_33131 = *(__global float *) &images_mem_37894[(gtid_32791 *
                                                             sizze_31216 +
                                                             gtid_32821) * 4];
            x_33132 = *(__global float *) &arg_mem_37903[(gtid_32792 *
                                                          sizze_31214 +
                                                          gtid_32821) * 4];
            x_33133 = *(__global float *) &mem_37995[(gtid_32793 * sizze_31214 +
                                                      gtid_32821) * 4];
            x_33134 = x_33132 * x_33133;
            res_33135 = futrts_isnan32(x_33131);
            if (res_33135) {
                y_33136 = 0.0F;
            } else {
                y_33136 = 1.0F;
            }
            res_33137 = x_33134 * y_33136;
            // save results to be reduced
            {
                x_33126 = res_33137;
            }
            // save map-out results
            { }
            // apply reduction operator
            {
                float res_33127 = x_33125 + x_33126;
                
                x_33125 = res_33127;
            }
        }
    }
    // to reduce current chunk, first store our result to memory
    {
        *(__local float *) &red_arr_mem_38672[local_tid_32823 * 4] = x_33125;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t offset_38681;
    int32_t skip_waves_38682;
    float x_38677;
    float x_38678;
    
    offset_38681 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_32823, group_sizze_33109)) {
            x_38677 = *(__local float *) &red_arr_mem_38672[(local_tid_32823 +
                                                             offset_38681) * 4];
        }
    }
    offset_38681 = 1;
    while (slt32(offset_38681, wave_sizze_38663)) {
        if (slt32(local_tid_32823 + offset_38681, group_sizze_33109) &&
            ((local_tid_32823 - squot32(local_tid_32823, wave_sizze_38663) *
              wave_sizze_38663) & (2 * offset_38681 - 1)) == 0) {
            // read array element
            {
                x_38678 = *(volatile __local
                            float *) &red_arr_mem_38672[(local_tid_32823 +
                                                         offset_38681) * 4];
            }
            // apply reduction operation
            {
                float res_38679 = x_38677 + x_38678;
                
                x_38677 = res_38679;
            }
            // write result of operation
            {
                *(volatile __local float *) &red_arr_mem_38672[local_tid_32823 *
                                                               4] = x_38677;
            }
        }
        offset_38681 *= 2;
    }
    skip_waves_38682 = 1;
    while (slt32(skip_waves_38682, squot32(group_sizze_33109 +
                                           wave_sizze_38663 - 1,
                                           wave_sizze_38663))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_38681 = skip_waves_38682 * wave_sizze_38663;
        if (slt32(local_tid_32823 + offset_38681, group_sizze_33109) &&
            ((local_tid_32823 - squot32(local_tid_32823, wave_sizze_38663) *
              wave_sizze_38663) == 0 && (squot32(local_tid_32823,
                                                 wave_sizze_38663) & (2 *
                                                                      skip_waves_38682 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_38678 = *(__local
                            float *) &red_arr_mem_38672[(local_tid_32823 +
                                                         offset_38681) * 4];
            }
            // apply reduction operation
            {
                float res_38679 = x_38677 + x_38678;
                
                x_38677 = res_38679;
            }
            // write result of operation
            {
                *(__local float *) &red_arr_mem_38672[local_tid_32823 * 4] =
                    x_38677;
            }
        }
        skip_waves_38682 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (squot32(num_groups_33119 + smax32(1, sizze_31215 * res_31237 *
                                          res_31237) - 1, smax32(1,
                                                                 sizze_31215 *
                                                                 res_31237 *
                                                                 res_31237)) ==
        1) {
        // first thread in group saves final result to memory
        {
            if (local_tid_32823 == 0) {
                *(__global float *) &mem_38000[(gtid_32791 * (res_31237 *
                                                              res_31237) +
                                                gtid_32792 * res_31237 +
                                                gtid_32793) * 4] = x_38677;
            }
        }
    } else {
        int32_t old_counter_38683;
        
        // first thread in group saves group result to memory
        {
            if (local_tid_32823 == 0) {
                *(__global float *) &group_res_arr_mem_38668[group_id_32824 *
                                                             4] = x_38677;
                mem_fence_global();
                old_counter_38683 = atomic_add((volatile __global
                                                int *) &counter_mem_38670[srem32(squot32(group_id_32824,
                                                                                         squot32(num_groups_33119 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        res_31237 *
                                                                                                        res_31237) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        res_31237 *
                                                                                                        res_31237))),
                                                                                 1024) *
                                                                          4],
                                               1);
                *(__local bool *) &sync_arr_mem_38674[0] = old_counter_38683 ==
                    squot32(num_groups_33119 + smax32(1, sizze_31215 *
                                                      res_31237 * res_31237) -
                            1, smax32(1, sizze_31215 * res_31237 * res_31237)) -
                    1;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        
        bool is_last_group_38684 = *(__local bool *) &sync_arr_mem_38674[0];
        
        if (is_last_group_38684) {
            if (local_tid_32823 == 0) {
                old_counter_38683 = atomic_add((volatile __global
                                                int *) &counter_mem_38670[srem32(squot32(group_id_32824,
                                                                                         squot32(num_groups_33119 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        res_31237 *
                                                                                                        res_31237) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        res_31237 *
                                                                                                        res_31237))),
                                                                                 1024) *
                                                                          4],
                                               0 - squot32(num_groups_33119 +
                                                           smax32(1,
                                                                  sizze_31215 *
                                                                  res_31237 *
                                                                  res_31237) -
                                                           1, smax32(1,
                                                                     sizze_31215 *
                                                                     res_31237 *
                                                                     res_31237)));
            }
            // read in the per-group-results
            {
                if (slt32(local_tid_32823, squot32(num_groups_33119 + smax32(1,
                                                                             sizze_31215 *
                                                                             res_31237 *
                                                                             res_31237) -
                                                   1, smax32(1, sizze_31215 *
                                                             res_31237 *
                                                             res_31237)))) {
                    x_33125 = *(__global
                                float *) &group_res_arr_mem_38668[(squot32(group_id_32824,
                                                                           squot32(num_groups_33119 +
                                                                                   smax32(1,
                                                                                          sizze_31215 *
                                                                                          res_31237 *
                                                                                          res_31237) -
                                                                                   1,
                                                                                   smax32(1,
                                                                                          sizze_31215 *
                                                                                          res_31237 *
                                                                                          res_31237))) *
                                                                   squot32(num_groups_33119 +
                                                                           smax32(1,
                                                                                  sizze_31215 *
                                                                                  res_31237 *
                                                                                  res_31237) -
                                                                           1,
                                                                           smax32(1,
                                                                                  sizze_31215 *
                                                                                  res_31237 *
                                                                                  res_31237)) +
                                                                   local_tid_32823) *
                                                                  4];
                } else {
                    x_33125 = 0.0F;
                }
                *(__local float *) &red_arr_mem_38672[local_tid_32823 * 4] =
                    x_33125;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            // reduce the per-group results
            {
                int32_t offset_38685;
                int32_t skip_waves_38686;
                float x_38677;
                float x_38678;
                
                offset_38685 = 0;
                // participating threads read initial accumulator
                {
                    if (slt32(local_tid_32823, group_sizze_33109)) {
                        x_38677 = *(__local
                                    float *) &red_arr_mem_38672[(local_tid_32823 +
                                                                 offset_38685) *
                                                                4];
                    }
                }
                offset_38685 = 1;
                while (slt32(offset_38685, wave_sizze_38663)) {
                    if (slt32(local_tid_32823 + offset_38685,
                              group_sizze_33109) && ((local_tid_32823 -
                                                      squot32(local_tid_32823,
                                                              wave_sizze_38663) *
                                                      wave_sizze_38663) & (2 *
                                                                           offset_38685 -
                                                                           1)) ==
                        0) {
                        // read array element
                        {
                            x_38678 = *(volatile __local
                                        float *) &red_arr_mem_38672[(local_tid_32823 +
                                                                     offset_38685) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            float res_38679 = x_38677 + x_38678;
                            
                            x_38677 = res_38679;
                        }
                        // write result of operation
                        {
                            *(volatile __local
                              float *) &red_arr_mem_38672[local_tid_32823 * 4] =
                                x_38677;
                        }
                    }
                    offset_38685 *= 2;
                }
                skip_waves_38686 = 1;
                while (slt32(skip_waves_38686, squot32(group_sizze_33109 +
                                                       wave_sizze_38663 - 1,
                                                       wave_sizze_38663))) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                    offset_38685 = skip_waves_38686 * wave_sizze_38663;
                    if (slt32(local_tid_32823 + offset_38685,
                              group_sizze_33109) && ((local_tid_32823 -
                                                      squot32(local_tid_32823,
                                                              wave_sizze_38663) *
                                                      wave_sizze_38663) == 0 &&
                                                     (squot32(local_tid_32823,
                                                              wave_sizze_38663) &
                                                      (2 * skip_waves_38686 -
                                                       1)) == 0)) {
                        // read array element
                        {
                            x_38678 = *(__local
                                        float *) &red_arr_mem_38672[(local_tid_32823 +
                                                                     offset_38685) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            float res_38679 = x_38677 + x_38678;
                            
                            x_38677 = res_38679;
                        }
                        // write result of operation
                        {
                            *(__local
                              float *) &red_arr_mem_38672[local_tid_32823 * 4] =
                                x_38677;
                        }
                    }
                    skip_waves_38686 *= 2;
                }
                // and back to memory with the final result
                {
                    if (local_tid_32823 == 0) {
                        *(__global float *) &mem_38000[(gtid_32791 *
                                                        (res_31237 *
                                                         res_31237) +
                                                        gtid_32792 * res_31237 +
                                                        gtid_32793) * 4] =
                            x_38677;
                    }
                }
            }
        }
    }
}
// Segmented float-sum reduction in which one segment may be shared by several
// work-groups.
//
// For each segment (gtid_34011, gtid_34012) the kernel adds up, over the
// element indices gtid_34037 visited by the threads of that segment,
//     arg_mem_37903[gtid_34012 * sizze_31214 + gtid_34037] *
//     images_mem_37894[gtid_34011 * sizze_31216 + gtid_34037]
// (both read as float), using 0 for a term whose images_mem_37894 element
// is NaN.  The total is stored as a float at element index
// gtid_34011 * res_31237 + gtid_34012 of mem_38082.
//
// Parameters, as they are used below:
//   sizze_31214, sizze_31216   row strides (in floats) of arg_mem_37903 and
//                              images_mem_37894.
//   sizze_31215, res_31237     their product, clamped to at least 1, divides
//                              num_groups_34145 to give the number of groups
//                              per segment; res_31237 is also the row stride
//                              of mem_38082.
//   n_31219                    element indices read within a segment stay
//                              below this value.
//   num_groups_34145           see sizze_31215 / res_31237 above.
//   thread_per_segment_38752   distance between consecutive elements read by
//                              one thread.
//   group_res_arr_mem_38753    one float per group; holds per-group partial
//                              sums when a segment spans several groups.
//   counter_mem_38755          int counters used to find the last group of a
//                              segment to finish; indexed by the segment
//                              number reduced by srem32(..., 1024).
//
// NOTE(review): squot32, srem32, smax32, smin32, slt32, futrts_isnan32,
// mem_fence_global, ALIGNED_LOCAL_MEMORY, LOCKSTEP_WIDTH and
// mainzigroup_sizze_34020 are defined outside this kernel.  The comments here
// assume the s* helpers are signed 32-bit quotient, remainder, max, min and
// less-than, so that squot32(a + b - 1, b) is a round-up division -- confirm
// against their definitions.
//
// Shorthand used in the comments below:
//   groups per segment = squot32(num_groups_34145 +
//                                smax32(1, sizze_31215 * res_31237) - 1,
//                                smax32(1, sizze_31215 * res_31237))
//   segment number     = squot32(group_id_34040, groups per segment)
__kernel void segred_large_34038(int32_t sizze_31214, int32_t sizze_31215,
                                 int32_t sizze_31216, int32_t n_31219,
                                 int32_t res_31237, int32_t num_groups_34145,
                                 __global unsigned char *images_mem_37894,
                                 __global unsigned char *arg_mem_37903, __global
                                 unsigned char *mem_38082,
                                 int32_t thread_per_segment_38752, __global
                                 unsigned char *group_res_arr_mem_38753,
                                 __global unsigned char *counter_mem_38755)
{
    // Group size used by all index arithmetic and reduction bounds below.
    const int32_t group_sizze_34135 = mainzigroup_sizze_34020;
    // block_dim0..2 are not referenced again in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    // Backing storage for the reduction scratch array (4 bytes per thread of
    // the group) and for the one-byte is-last-group flag.
    ALIGNED_LOCAL_MEMORY(red_arr_mem_38757_backing_0, 4 *
                         mainzigroup_sizze_34020);
    ALIGNED_LOCAL_MEMORY(sync_arr_mem_38759_backing_1, 1);
    
    int32_t global_tid_34038;
    int32_t local_tid_34039;
    int32_t group_sizze_38749;
    int32_t wave_sizze_38748;
    int32_t group_id_34040;
    
    global_tid_34038 = get_global_id(0);
    local_tid_34039 = get_local_id(0);
    // group_sizze_38749 is assigned here but not read again in this kernel;
    // group_sizze_34135 is used instead.
    group_sizze_38749 = get_local_size(0);
    wave_sizze_38748 = LOCKSTEP_WIDTH;
    group_id_34040 = get_group_id(0);
    
    int32_t gtid_34011;
    int32_t gtid_34012;
    int32_t gtid_34037;
    __local char *red_arr_mem_38757;
    
    red_arr_mem_38757 = (__local char *) red_arr_mem_38757_backing_0;
    
    __local char *sync_arr_mem_38759;
    
    sync_arr_mem_38759 = (__local char *) sync_arr_mem_38759_backing_1;
    // Split the segment number into two coordinates:
    //   gtid_34011 = segment number / res_31237
    //   gtid_34012 = segment number - gtid_34011 * res_31237
    gtid_34011 = squot32(squot32(group_id_34040, squot32(num_groups_34145 +
                                                         smax32(1, sizze_31215 *
                                                                res_31237) - 1,
                                                         smax32(1, sizze_31215 *
                                                                res_31237))),
                         res_31237);
    gtid_34012 = squot32(group_id_34040, squot32(num_groups_34145 + smax32(1,
                                                                           sizze_31215 *
                                                                           res_31237) -
                                                 1, smax32(1, sizze_31215 *
                                                           res_31237))) -
        squot32(squot32(group_id_34040, squot32(num_groups_34145 + smax32(1,
                                                                          sizze_31215 *
                                                                          res_31237) -
                                                1, smax32(1, sizze_31215 *
                                                          res_31237))),
                res_31237) * res_31237;
    
    // Number of elements this thread processes: the smaller of
    //   round-up of n_31219 / (group_sizze_34135 * groups per segment) and
    //   round-up of (n_31219 - t) / thread_per_segment_38752,
    // where t = srem32(global_tid_34038,
    //                  group_sizze_34135 * groups per segment)
    // is the first element index this thread reads.
    int32_t chunk_sizze_38761 = smin32(squot32(n_31219 + group_sizze_34135 *
                                               squot32(num_groups_34145 +
                                                       smax32(1, sizze_31215 *
                                                              res_31237) - 1,
                                                       smax32(1, sizze_31215 *
                                                              res_31237)) - 1,
                                               group_sizze_34135 *
                                               squot32(num_groups_34145 +
                                                       smax32(1, sizze_31215 *
                                                              res_31237) - 1,
                                                       smax32(1, sizze_31215 *
                                                              res_31237))),
                                       squot32(n_31219 -
                                               srem32(global_tid_34038,
                                                      group_sizze_34135 *
                                                      squot32(num_groups_34145 +
                                                              smax32(1,
                                                                     sizze_31215 *
                                                                     res_31237) -
                                                              1, smax32(1,
                                                                        sizze_31215 *
                                                                        res_31237))) +
                                               thread_per_segment_38752 - 1,
                                               thread_per_segment_38752));
    // x_34151 is this thread's running sum; x_34152 is the current term.
    float x_34151;
    float x_34152;
    
    x_34151 = 0.0F;
    for (int32_t i_38765 = 0; i_38765 < chunk_sizze_38761; i_38765++) {
        // Element index for this iteration:
        // t + thread_per_segment_38752 * i_38765 (t as defined above).
        gtid_34037 = srem32(global_tid_34038, group_sizze_34135 *
                            squot32(num_groups_34145 + smax32(1, sizze_31215 *
                                                              res_31237) - 1,
                                    smax32(1, sizze_31215 * res_31237))) +
            thread_per_segment_38752 * i_38765;
        // apply map function
        {
            float x_34156;
            float x_34157;
            bool res_34158;
            float res_34159;
            
            x_34156 = *(__global float *) &arg_mem_37903[(gtid_34012 *
                                                          sizze_31214 +
                                                          gtid_34037) * 4];
            x_34157 = *(__global float *) &images_mem_37894[(gtid_34011 *
                                                             sizze_31216 +
                                                             gtid_34037) * 4];
            // A NaN in the images element contributes 0 instead of the
            // product.
            res_34158 = futrts_isnan32(x_34157);
            if (res_34158) {
                res_34159 = 0.0F;
            } else {
                float res_34160 = x_34156 * x_34157;
                
                res_34159 = res_34160;
            }
            // save results to be reduced
            {
                x_34152 = res_34159;
            }
            // save map-out results
            { }
            // apply reduction operator
            {
                float res_34153 = x_34151 + x_34152;
                
                x_34151 = res_34153;
            }
        }
    }
    // to reduce current chunk, first store our result to memory
    {
        *(__local float *) &red_arr_mem_38757[local_tid_34039 * 4] = x_34151;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    // Tree reduction over the per-thread sums in red_arr_mem_38757.  The
    // value thread 0 holds in x_38762 afterwards is the one written out
    // below.
    int32_t offset_38766;
    int32_t skip_waves_38767;
    float x_38762;
    float x_38763;
    
    offset_38766 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_34039, group_sizze_34135)) {
            x_38762 = *(__local float *) &red_arr_mem_38757[(local_tid_34039 +
                                                             offset_38766) * 4];
        }
    }
    // Stage 1: combine within each run of wave_sizze_38748 consecutive
    // threads.  offset_38766 doubles every step; a thread takes part when
    // local id + offset is still inside the group and its index within the
    // wave has all bits selected by (2 * offset - 1) equal to zero.
    // No barrier is issued inside this loop and the local-memory accesses
    // are volatile.
    // NOTE(review): presumably this relies on LOCKSTEP_WIDTH threads
    // executing in lockstep -- confirm.
    offset_38766 = 1;
    while (slt32(offset_38766, wave_sizze_38748)) {
        if (slt32(local_tid_34039 + offset_38766, group_sizze_34135) &&
            ((local_tid_34039 - squot32(local_tid_34039, wave_sizze_38748) *
              wave_sizze_38748) & (2 * offset_38766 - 1)) == 0) {
            // read array element
            {
                x_38763 = *(volatile __local
                            float *) &red_arr_mem_38757[(local_tid_34039 +
                                                         offset_38766) * 4];
            }
            // apply reduction operation
            {
                float res_38764 = x_38762 + x_38763;
                
                x_38762 = res_38764;
            }
            // write result of operation
            {
                *(volatile __local float *) &red_arr_mem_38757[local_tid_34039 *
                                                               4] = x_38762;
            }
        }
        offset_38766 *= 2;
    }
    // Stage 2: combine across waves.  Only the first thread of a wave takes
    // part, and only when its wave index has all bits selected by
    // (2 * skip_waves - 1) equal to zero.  A barrier precedes every step;
    // the loop condition depends only on group_sizze_34135 and
    // wave_sizze_38748, not on the thread id.
    skip_waves_38767 = 1;
    while (slt32(skip_waves_38767, squot32(group_sizze_34135 +
                                           wave_sizze_38748 - 1,
                                           wave_sizze_38748))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_38766 = skip_waves_38767 * wave_sizze_38748;
        if (slt32(local_tid_34039 + offset_38766, group_sizze_34135) &&
            ((local_tid_34039 - squot32(local_tid_34039, wave_sizze_38748) *
              wave_sizze_38748) == 0 && (squot32(local_tid_34039,
                                                 wave_sizze_38748) & (2 *
                                                                      skip_waves_38767 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_38763 = *(__local
                            float *) &red_arr_mem_38757[(local_tid_34039 +
                                                         offset_38766) * 4];
            }
            // apply reduction operation
            {
                float res_38764 = x_38762 + x_38763;
                
                x_38762 = res_38764;
            }
            // write result of operation
            {
                *(__local float *) &red_arr_mem_38757[local_tid_34039 * 4] =
                    x_38762;
            }
        }
        skip_waves_38767 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // With exactly one group per segment the group total is already the
    // segment total; otherwise the groups of a segment cooperate through
    // group_res_arr_mem_38753 and counter_mem_38755.
    if (squot32(num_groups_34145 + smax32(1, sizze_31215 * res_31237) - 1,
                smax32(1, sizze_31215 * res_31237)) == 1) {
        // first thread in group saves final result to memory
        {
            if (local_tid_34039 == 0) {
                *(__global float *) &mem_38082[(gtid_34011 * res_31237 +
                                                gtid_34012) * 4] = x_38762;
            }
        }
    } else {
        int32_t old_counter_38768;
        
        // first thread in group saves group result to memory
        {
            if (local_tid_34039 == 0) {
                *(__global float *) &group_res_arr_mem_38753[group_id_34040 *
                                                             4] = x_38762;
                // The fence is issued after the partial-sum store and before
                // the counter increment.
                mem_fence_global();
                // Add 1 to the counter at index srem32(segment number, 1024)
                // and keep the value it had before the addition.
                old_counter_38768 = atomic_add((volatile __global
                                                int *) &counter_mem_38755[srem32(squot32(group_id_34040,
                                                                                         squot32(num_groups_34145 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        res_31237) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        res_31237))),
                                                                                 1024) *
                                                                          4],
                                               1);
                // This group is the last one of its segment to get here when
                // the previous counter value equals groups per segment - 1.
                // The flag is published to the group through local memory.
                *(__local bool *) &sync_arr_mem_38759[0] = old_counter_38768 ==
                    squot32(num_groups_34145 + smax32(1, sizze_31215 *
                                                      res_31237) - 1, smax32(1,
                                                                             sizze_31215 *
                                                                             res_31237)) -
                    1;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        
        bool is_last_group_38769 = *(__local bool *) &sync_arr_mem_38759[0];
        
        if (is_last_group_38769) {
            // Thread 0 adds -(groups per segment) to the same counter,
            // undoing the increments counted above.
            if (local_tid_34039 == 0) {
                old_counter_38768 = atomic_add((volatile __global
                                                int *) &counter_mem_38755[srem32(squot32(group_id_34040,
                                                                                         squot32(num_groups_34145 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        res_31237) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        res_31237))),
                                                                                 1024) *
                                                                          4],
                                               0 - squot32(num_groups_34145 +
                                                           smax32(1,
                                                                  sizze_31215 *
                                                                  res_31237) -
                                                           1, smax32(1,
                                                                     sizze_31215 *
                                                                     res_31237)));
            }
            // read in the per-group-results
            // Threads with local id below groups per segment load the partial
            // sum at index segment number * groups per segment + local id;
            // all other threads contribute 0.
            // NOTE(review): only the first group_sizze_34135 partial sums can
            // be loaded this way -- confirm the host never launches more
            // groups per segment than that.
            {
                if (slt32(local_tid_34039, squot32(num_groups_34145 + smax32(1,
                                                                             sizze_31215 *
                                                                             res_31237) -
                                                   1, smax32(1, sizze_31215 *
                                                             res_31237)))) {
                    x_34151 = *(__global
                                float *) &group_res_arr_mem_38753[(squot32(group_id_34040,
                                                                           squot32(num_groups_34145 +
                                                                                   smax32(1,
                                                                                          sizze_31215 *
                                                                                          res_31237) -
                                                                                   1,
                                                                                   smax32(1,
                                                                                          sizze_31215 *
                                                                                          res_31237))) *
                                                                   squot32(num_groups_34145 +
                                                                           smax32(1,
                                                                                  sizze_31215 *
                                                                                  res_31237) -
                                                                           1,
                                                                           smax32(1,
                                                                                  sizze_31215 *
                                                                                  res_31237)) +
                                                                   local_tid_34039) *
                                                                  4];
                } else {
                    x_34151 = 0.0F;
                }
                *(__local float *) &red_arr_mem_38757[local_tid_34039 * 4] =
                    x_34151;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            // reduce the per-group results
            // Same two-stage tree reduction as above.  x_38762 and x_38763
            // declared here shadow the outer variables of the same names.
            {
                int32_t offset_38770;
                int32_t skip_waves_38771;
                float x_38762;
                float x_38763;
                
                offset_38770 = 0;
                // participating threads read initial accumulator
                {
                    if (slt32(local_tid_34039, group_sizze_34135)) {
                        x_38762 = *(__local
                                    float *) &red_arr_mem_38757[(local_tid_34039 +
                                                                 offset_38770) *
                                                                4];
                    }
                }
                // Stage 1: within each wave, volatile accesses, no barrier.
                offset_38770 = 1;
                while (slt32(offset_38770, wave_sizze_38748)) {
                    if (slt32(local_tid_34039 + offset_38770,
                              group_sizze_34135) && ((local_tid_34039 -
                                                      squot32(local_tid_34039,
                                                              wave_sizze_38748) *
                                                      wave_sizze_38748) & (2 *
                                                                           offset_38770 -
                                                                           1)) ==
                        0) {
                        // read array element
                        {
                            x_38763 = *(volatile __local
                                        float *) &red_arr_mem_38757[(local_tid_34039 +
                                                                     offset_38770) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            float res_38764 = x_38762 + x_38763;
                            
                            x_38762 = res_38764;
                        }
                        // write result of operation
                        {
                            *(volatile __local
                              float *) &red_arr_mem_38757[local_tid_34039 * 4] =
                                x_38762;
                        }
                    }
                    offset_38770 *= 2;
                }
                // Stage 2: across waves, with a barrier before every step.
                skip_waves_38771 = 1;
                while (slt32(skip_waves_38771, squot32(group_sizze_34135 +
                                                       wave_sizze_38748 - 1,
                                                       wave_sizze_38748))) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                    offset_38770 = skip_waves_38771 * wave_sizze_38748;
                    if (slt32(local_tid_34039 + offset_38770,
                              group_sizze_34135) && ((local_tid_34039 -
                                                      squot32(local_tid_34039,
                                                              wave_sizze_38748) *
                                                      wave_sizze_38748) == 0 &&
                                                     (squot32(local_tid_34039,
                                                              wave_sizze_38748) &
                                                      (2 * skip_waves_38771 -
                                                       1)) == 0)) {
                        // read array element
                        {
                            x_38763 = *(__local
                                        float *) &red_arr_mem_38757[(local_tid_34039 +
                                                                     offset_38770) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            float res_38764 = x_38762 + x_38763;
                            
                            x_38762 = res_38764;
                        }
                        // write result of operation
                        {
                            *(__local
                              float *) &red_arr_mem_38757[local_tid_34039 * 4] =
                                x_38762;
                        }
                    }
                    skip_waves_38771 *= 2;
                }
                // and back to memory with the final result
                {
                    if (local_tid_34039 == 0) {
                        *(__global float *) &mem_38082[(gtid_34011 * res_31237 +
                                                        gtid_34012) * 4] =
                            x_38762;
                    }
                }
            }
        }
    }
}
__kernel void segred_large_34370(int32_t sizze_31215, int32_t res_31237,
                                 int32_t j_m_i_31370, int32_t num_groups_34471,
                                 __global unsigned char *res_mem_38037, __global
                                 unsigned char *res_mem_38086, __global
                                 unsigned char *mem_38138,
                                 int32_t thread_per_segment_38813, __global
                                 unsigned char *group_res_arr_mem_38814,
                                 __global unsigned char *counter_mem_38816)
{
    const int32_t group_sizze_34461 = mainzigroup_sizze_34352;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    ALIGNED_LOCAL_MEMORY(red_arr_mem_38818_backing_0, 4 *
                         mainzigroup_sizze_34352);
    ALIGNED_LOCAL_MEMORY(sync_arr_mem_38820_backing_1, 1);
    
    int32_t global_tid_34370;
    int32_t local_tid_34371;
    int32_t group_sizze_38810;
    int32_t wave_sizze_38809;
    int32_t group_id_34372;
    
    global_tid_34370 = get_global_id(0);
    local_tid_34371 = get_local_id(0);
    group_sizze_38810 = get_local_size(0);
    wave_sizze_38809 = LOCKSTEP_WIDTH;
    group_id_34372 = get_group_id(0);
    
    int32_t gtid_34344;
    int32_t gtid_34345;
    int32_t gtid_34369;
    __local char *red_arr_mem_38818;
    
    red_arr_mem_38818 = (__local char *) red_arr_mem_38818_backing_0;
    
    __local char *sync_arr_mem_38820;
    
    sync_arr_mem_38820 = (__local char *) sync_arr_mem_38820_backing_1;
    gtid_34344 = squot32(squot32(group_id_34372, squot32(num_groups_34471 +
                                                         smax32(1, sizze_31215 *
                                                                res_31237) - 1,
                                                         smax32(1, sizze_31215 *
                                                                res_31237))),
                         res_31237);
    gtid_34345 = squot32(group_id_34372, squot32(num_groups_34471 + smax32(1,
                                                                           sizze_31215 *
                                                                           res_31237) -
                                                 1, smax32(1, sizze_31215 *
                                                           res_31237))) -
        squot32(squot32(group_id_34372, squot32(num_groups_34471 + smax32(1,
                                                                          sizze_31215 *
                                                                          res_31237) -
                                                1, smax32(1, sizze_31215 *
                                                          res_31237))),
                res_31237) * res_31237;
    
    int32_t chunk_sizze_38822 = smin32(squot32(j_m_i_31370 + group_sizze_34461 *
                                               squot32(num_groups_34471 +
                                                       smax32(1, sizze_31215 *
                                                              res_31237) - 1,
                                                       smax32(1, sizze_31215 *
                                                              res_31237)) - 1,
                                               group_sizze_34461 *
                                               squot32(num_groups_34471 +
                                                       smax32(1, sizze_31215 *
                                                              res_31237) - 1,
                                                       smax32(1, sizze_31215 *
                                                              res_31237))),
                                       squot32(j_m_i_31370 -
                                               srem32(global_tid_34370,
                                                      group_sizze_34461 *
                                                      squot32(num_groups_34471 +
                                                              smax32(1,
                                                                     sizze_31215 *
                                                                     res_31237) -
                                                              1, smax32(1,
                                                                        sizze_31215 *
                                                                        res_31237))) +
                                               thread_per_segment_38813 - 1,
                                               thread_per_segment_38813));
    float x_34477;
    float x_34478;
    
    x_34477 = 0.0F;
    for (int32_t i_38826 = 0; i_38826 < chunk_sizze_38822; i_38826++) {
        gtid_34369 = srem32(global_tid_34370, group_sizze_34461 *
                            squot32(num_groups_34471 + smax32(1, sizze_31215 *
                                                              res_31237) - 1,
                                    smax32(1, sizze_31215 * res_31237))) +
            thread_per_segment_38813 * i_38826;
        // apply map function
        {
            int32_t binop_x_37160;
            int32_t binop_x_37161;
            int32_t new_index_37162;
            int32_t binop_y_37168;
            int32_t new_index_37169;
            float x_34483;
            float x_34484;
            float res_34485;
            
            binop_x_37160 = j_m_i_31370 * gtid_34344;
            binop_x_37161 = gtid_34369 + binop_x_37160;
            new_index_37162 = squot32(binop_x_37161, res_31237);
            binop_y_37168 = res_31237 * new_index_37162;
            new_index_37169 = binop_x_37161 - binop_y_37168;
            x_34483 = *(__global float *) &res_mem_38086[(new_index_37162 *
                                                          res_31237 +
                                                          new_index_37169) * 4];
            x_34484 = *(__global float *) &res_mem_38037[(gtid_34344 *
                                                          (j_m_i_31370 *
                                                           res_31237) +
                                                          gtid_34345 *
                                                          j_m_i_31370 +
                                                          gtid_34369) * 4];
            res_34485 = x_34483 * x_34484;
            // save results to be reduced
            {
                x_34478 = res_34485;
            }
            // save map-out results
            { }
            // apply reduction operator
            {
                float res_34479 = x_34477 + x_34478;
                
                x_34477 = res_34479;
            }
        }
    }
    // to reduce current chunk, first store our result to memory
    {
        *(__local float *) &red_arr_mem_38818[local_tid_34371 * 4] = x_34477;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t offset_38827;
    int32_t skip_waves_38828;
    float x_38823;
    float x_38824;
    
    offset_38827 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_34371, group_sizze_34461)) {
            x_38823 = *(__local float *) &red_arr_mem_38818[(local_tid_34371 +
                                                             offset_38827) * 4];
        }
    }
    offset_38827 = 1;
    while (slt32(offset_38827, wave_sizze_38809)) {
        if (slt32(local_tid_34371 + offset_38827, group_sizze_34461) &&
            ((local_tid_34371 - squot32(local_tid_34371, wave_sizze_38809) *
              wave_sizze_38809) & (2 * offset_38827 - 1)) == 0) {
            // read array element
            {
                x_38824 = *(volatile __local
                            float *) &red_arr_mem_38818[(local_tid_34371 +
                                                         offset_38827) * 4];
            }
            // apply reduction operation
            {
                float res_38825 = x_38823 + x_38824;
                
                x_38823 = res_38825;
            }
            // write result of operation
            {
                *(volatile __local float *) &red_arr_mem_38818[local_tid_34371 *
                                                               4] = x_38823;
            }
        }
        offset_38827 *= 2;
    }
    skip_waves_38828 = 1;
    while (slt32(skip_waves_38828, squot32(group_sizze_34461 +
                                           wave_sizze_38809 - 1,
                                           wave_sizze_38809))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_38827 = skip_waves_38828 * wave_sizze_38809;
        if (slt32(local_tid_34371 + offset_38827, group_sizze_34461) &&
            ((local_tid_34371 - squot32(local_tid_34371, wave_sizze_38809) *
              wave_sizze_38809) == 0 && (squot32(local_tid_34371,
                                                 wave_sizze_38809) & (2 *
                                                                      skip_waves_38828 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_38824 = *(__local
                            float *) &red_arr_mem_38818[(local_tid_34371 +
                                                         offset_38827) * 4];
            }
            // apply reduction operation
            {
                float res_38825 = x_38823 + x_38824;
                
                x_38823 = res_38825;
            }
            // write result of operation
            {
                *(__local float *) &red_arr_mem_38818[local_tid_34371 * 4] =
                    x_38823;
            }
        }
        skip_waves_38828 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (squot32(num_groups_34471 + smax32(1, sizze_31215 * res_31237) - 1,
                smax32(1, sizze_31215 * res_31237)) == 1) {
        // first thread in group saves final result to memory
        {
            if (local_tid_34371 == 0) {
                *(__global float *) &mem_38138[(gtid_34344 * res_31237 +
                                                gtid_34345) * 4] = x_38823;
            }
        }
    } else {
        int32_t old_counter_38829;
        
        // first thread in group saves group result to memory
        {
            if (local_tid_34371 == 0) {
                *(__global float *) &group_res_arr_mem_38814[group_id_34372 *
                                                             4] = x_38823;
                mem_fence_global();
                old_counter_38829 = atomic_add((volatile __global
                                                int *) &counter_mem_38816[srem32(squot32(group_id_34372,
                                                                                         squot32(num_groups_34471 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        res_31237) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        res_31237))),
                                                                                 1024) *
                                                                          4],
                                               1);
                *(__local bool *) &sync_arr_mem_38820[0] = old_counter_38829 ==
                    squot32(num_groups_34471 + smax32(1, sizze_31215 *
                                                      res_31237) - 1, smax32(1,
                                                                             sizze_31215 *
                                                                             res_31237)) -
                    1;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        
        bool is_last_group_38830 = *(__local bool *) &sync_arr_mem_38820[0];
        
        if (is_last_group_38830) {
            if (local_tid_34371 == 0) {
                old_counter_38829 = atomic_add((volatile __global
                                                int *) &counter_mem_38816[srem32(squot32(group_id_34372,
                                                                                         squot32(num_groups_34471 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        res_31237) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        res_31237))),
                                                                                 1024) *
                                                                          4],
                                               0 - squot32(num_groups_34471 +
                                                           smax32(1,
                                                                  sizze_31215 *
                                                                  res_31237) -
                                                           1, smax32(1,
                                                                     sizze_31215 *
                                                                     res_31237)));
            }
            // read in the per-group-results
            {
                if (slt32(local_tid_34371, squot32(num_groups_34471 + smax32(1,
                                                                             sizze_31215 *
                                                                             res_31237) -
                                                   1, smax32(1, sizze_31215 *
                                                             res_31237)))) {
                    x_34477 = *(__global
                                float *) &group_res_arr_mem_38814[(squot32(group_id_34372,
                                                                           squot32(num_groups_34471 +
                                                                                   smax32(1,
                                                                                          sizze_31215 *
                                                                                          res_31237) -
                                                                                   1,
                                                                                   smax32(1,
                                                                                          sizze_31215 *
                                                                                          res_31237))) *
                                                                   squot32(num_groups_34471 +
                                                                           smax32(1,
                                                                                  sizze_31215 *
                                                                                  res_31237) -
                                                                           1,
                                                                           smax32(1,
                                                                                  sizze_31215 *
                                                                                  res_31237)) +
                                                                   local_tid_34371) *
                                                                  4];
                } else {
                    x_34477 = 0.0F;
                }
                *(__local float *) &red_arr_mem_38818[local_tid_34371 * 4] =
                    x_34477;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            // reduce the per-group results
            {
                int32_t offset_38831;
                int32_t skip_waves_38832;
                float x_38823;
                float x_38824;
                
                offset_38831 = 0;
                // participating threads read initial accumulator
                {
                    if (slt32(local_tid_34371, group_sizze_34461)) {
                        x_38823 = *(__local
                                    float *) &red_arr_mem_38818[(local_tid_34371 +
                                                                 offset_38831) *
                                                                4];
                    }
                }
                offset_38831 = 1;
                while (slt32(offset_38831, wave_sizze_38809)) {
                    if (slt32(local_tid_34371 + offset_38831,
                              group_sizze_34461) && ((local_tid_34371 -
                                                      squot32(local_tid_34371,
                                                              wave_sizze_38809) *
                                                      wave_sizze_38809) & (2 *
                                                                           offset_38831 -
                                                                           1)) ==
                        0) {
                        // read array element
                        {
                            x_38824 = *(volatile __local
                                        float *) &red_arr_mem_38818[(local_tid_34371 +
                                                                     offset_38831) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            float res_38825 = x_38823 + x_38824;
                            
                            x_38823 = res_38825;
                        }
                        // write result of operation
                        {
                            *(volatile __local
                              float *) &red_arr_mem_38818[local_tid_34371 * 4] =
                                x_38823;
                        }
                    }
                    offset_38831 *= 2;
                }
                skip_waves_38832 = 1;
                while (slt32(skip_waves_38832, squot32(group_sizze_34461 +
                                                       wave_sizze_38809 - 1,
                                                       wave_sizze_38809))) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                    offset_38831 = skip_waves_38832 * wave_sizze_38809;
                    if (slt32(local_tid_34371 + offset_38831,
                              group_sizze_34461) && ((local_tid_34371 -
                                                      squot32(local_tid_34371,
                                                              wave_sizze_38809) *
                                                      wave_sizze_38809) == 0 &&
                                                     (squot32(local_tid_34371,
                                                              wave_sizze_38809) &
                                                      (2 * skip_waves_38832 -
                                                       1)) == 0)) {
                        // read array element
                        {
                            x_38824 = *(__local
                                        float *) &red_arr_mem_38818[(local_tid_34371 +
                                                                     offset_38831) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            float res_38825 = x_38823 + x_38824;
                            
                            x_38823 = res_38825;
                        }
                        // write result of operation
                        {
                            *(__local
                              float *) &red_arr_mem_38818[local_tid_34371 * 4] =
                                x_38823;
                        }
                    }
                    skip_waves_38832 *= 2;
                }
                // and back to memory with the final result
                {
                    if (local_tid_34371 == 0) {
                        *(__global float *) &mem_38138[(gtid_34344 * res_31237 +
                                                        gtid_34345) * 4] =
                            x_38823;
                    }
                }
            }
        }
    }
}
__kernel void segred_large_34689(int32_t sizze_31214, int32_t sizze_31215,
                                 int32_t res_31237, int32_t num_groups_34790,
                                 __global unsigned char *mem_37911, __global
                                 unsigned char *res_mem_38142, __global
                                 unsigned char *mem_38195,
                                 int32_t thread_per_segment_38874, __global
                                 unsigned char *group_res_arr_mem_38875,
                                 __global unsigned char *counter_mem_38877)
{
    const int32_t group_sizze_34780 = mainzigroup_sizze_34671;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    ALIGNED_LOCAL_MEMORY(red_arr_mem_38879_backing_0, 4 *
                         mainzigroup_sizze_34671);
    ALIGNED_LOCAL_MEMORY(sync_arr_mem_38881_backing_1, 1);
    
    int32_t global_tid_34689;
    int32_t local_tid_34690;
    int32_t group_sizze_38871;
    int32_t wave_sizze_38870;
    int32_t group_id_34691;
    
    global_tid_34689 = get_global_id(0);
    local_tid_34690 = get_local_id(0);
    group_sizze_38871 = get_local_size(0);
    wave_sizze_38870 = LOCKSTEP_WIDTH;
    group_id_34691 = get_group_id(0);
    
    int32_t gtid_34662;
    int32_t gtid_34663;
    int32_t gtid_34688;
    __local char *red_arr_mem_38879;
    
    red_arr_mem_38879 = (__local char *) red_arr_mem_38879_backing_0;
    
    __local char *sync_arr_mem_38881;
    
    sync_arr_mem_38881 = (__local char *) sync_arr_mem_38881_backing_1;
    gtid_34662 = squot32(squot32(group_id_34691, squot32(num_groups_34790 +
                                                         smax32(1, sizze_31215 *
                                                                sizze_31214) -
                                                         1, smax32(1,
                                                                   sizze_31215 *
                                                                   sizze_31214))),
                         sizze_31214);
    gtid_34663 = squot32(group_id_34691, squot32(num_groups_34790 + smax32(1,
                                                                           sizze_31215 *
                                                                           sizze_31214) -
                                                 1, smax32(1, sizze_31215 *
                                                           sizze_31214))) -
        squot32(squot32(group_id_34691, squot32(num_groups_34790 + smax32(1,
                                                                          sizze_31215 *
                                                                          sizze_31214) -
                                                1, smax32(1, sizze_31215 *
                                                          sizze_31214))),
                sizze_31214) * sizze_31214;
    
    int32_t chunk_sizze_38883 = smin32(squot32(res_31237 + group_sizze_34780 *
                                               squot32(num_groups_34790 +
                                                       smax32(1, sizze_31215 *
                                                              sizze_31214) - 1,
                                                       smax32(1, sizze_31215 *
                                                              sizze_31214)) - 1,
                                               group_sizze_34780 *
                                               squot32(num_groups_34790 +
                                                       smax32(1, sizze_31215 *
                                                              sizze_31214) - 1,
                                                       smax32(1, sizze_31215 *
                                                              sizze_31214))),
                                       squot32(res_31237 -
                                               srem32(global_tid_34689,
                                                      group_sizze_34780 *
                                                      squot32(num_groups_34790 +
                                                              smax32(1,
                                                                     sizze_31215 *
                                                                     sizze_31214) -
                                                              1, smax32(1,
                                                                        sizze_31215 *
                                                                        sizze_31214))) +
                                               thread_per_segment_38874 - 1,
                                               thread_per_segment_38874));
    float x_34796;
    float x_34797;
    
    x_34796 = 0.0F;
    for (int32_t i_38887 = 0; i_38887 < chunk_sizze_38883; i_38887++) {
        gtid_34688 = srem32(global_tid_34689, group_sizze_34780 *
                            squot32(num_groups_34790 + smax32(1, sizze_31215 *
                                                              sizze_31214) - 1,
                                    smax32(1, sizze_31215 * sizze_31214))) +
            thread_per_segment_38874 * i_38887;
        // apply map function
        {
            float x_34801;
            float x_34802;
            float res_34803;
            
            x_34801 = *(__global float *) &res_mem_38142[(gtid_34662 *
                                                          res_31237 +
                                                          gtid_34688) * 4];
            x_34802 = *(__global float *) &mem_37911[(gtid_34663 * res_31237 +
                                                      gtid_34688) * 4];
            res_34803 = x_34801 * x_34802;
            // save results to be reduced
            {
                x_34797 = res_34803;
            }
            // save map-out results
            { }
            // apply reduction operator
            {
                float res_34798 = x_34796 + x_34797;
                
                x_34796 = res_34798;
            }
        }
    }
    // to reduce current chunk, first store our result to memory
    {
        *(__local float *) &red_arr_mem_38879[local_tid_34690 * 4] = x_34796;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t offset_38888;
    int32_t skip_waves_38889;
    float x_38884;
    float x_38885;
    
    offset_38888 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_34690, group_sizze_34780)) {
            x_38884 = *(__local float *) &red_arr_mem_38879[(local_tid_34690 +
                                                             offset_38888) * 4];
        }
    }
    offset_38888 = 1;
    while (slt32(offset_38888, wave_sizze_38870)) {
        if (slt32(local_tid_34690 + offset_38888, group_sizze_34780) &&
            ((local_tid_34690 - squot32(local_tid_34690, wave_sizze_38870) *
              wave_sizze_38870) & (2 * offset_38888 - 1)) == 0) {
            // read array element
            {
                x_38885 = *(volatile __local
                            float *) &red_arr_mem_38879[(local_tid_34690 +
                                                         offset_38888) * 4];
            }
            // apply reduction operation
            {
                float res_38886 = x_38884 + x_38885;
                
                x_38884 = res_38886;
            }
            // write result of operation
            {
                *(volatile __local float *) &red_arr_mem_38879[local_tid_34690 *
                                                               4] = x_38884;
            }
        }
        offset_38888 *= 2;
    }
    skip_waves_38889 = 1;
    while (slt32(skip_waves_38889, squot32(group_sizze_34780 +
                                           wave_sizze_38870 - 1,
                                           wave_sizze_38870))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_38888 = skip_waves_38889 * wave_sizze_38870;
        if (slt32(local_tid_34690 + offset_38888, group_sizze_34780) &&
            ((local_tid_34690 - squot32(local_tid_34690, wave_sizze_38870) *
              wave_sizze_38870) == 0 && (squot32(local_tid_34690,
                                                 wave_sizze_38870) & (2 *
                                                                      skip_waves_38889 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_38885 = *(__local
                            float *) &red_arr_mem_38879[(local_tid_34690 +
                                                         offset_38888) * 4];
            }
            // apply reduction operation
            {
                float res_38886 = x_38884 + x_38885;
                
                x_38884 = res_38886;
            }
            // write result of operation
            {
                *(__local float *) &red_arr_mem_38879[local_tid_34690 * 4] =
                    x_38884;
            }
        }
        skip_waves_38889 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (squot32(num_groups_34790 + smax32(1, sizze_31215 * sizze_31214) - 1,
                smax32(1, sizze_31215 * sizze_31214)) == 1) {
        // first thread in group saves final result to memory
        {
            if (local_tid_34690 == 0) {
                *(__global float *) &mem_38195[(gtid_34662 * sizze_31214 +
                                                gtid_34663) * 4] = x_38884;
            }
        }
    } else {
        int32_t old_counter_38890;
        
        // first thread in group saves group result to memory
        {
            if (local_tid_34690 == 0) {
                *(__global float *) &group_res_arr_mem_38875[group_id_34691 *
                                                             4] = x_38884;
                mem_fence_global();
                old_counter_38890 = atomic_add((volatile __global
                                                int *) &counter_mem_38877[srem32(squot32(group_id_34691,
                                                                                         squot32(num_groups_34790 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        sizze_31214) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        sizze_31214))),
                                                                                 1024) *
                                                                          4],
                                               1);
                *(__local bool *) &sync_arr_mem_38881[0] = old_counter_38890 ==
                    squot32(num_groups_34790 + smax32(1, sizze_31215 *
                                                      sizze_31214) - 1,
                            smax32(1, sizze_31215 * sizze_31214)) - 1;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        
        bool is_last_group_38891 = *(__local bool *) &sync_arr_mem_38881[0];
        
        if (is_last_group_38891) {
            if (local_tid_34690 == 0) {
                old_counter_38890 = atomic_add((volatile __global
                                                int *) &counter_mem_38877[srem32(squot32(group_id_34691,
                                                                                         squot32(num_groups_34790 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        sizze_31214) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215 *
                                                                                                        sizze_31214))),
                                                                                 1024) *
                                                                          4],
                                               0 - squot32(num_groups_34790 +
                                                           smax32(1,
                                                                  sizze_31215 *
                                                                  sizze_31214) -
                                                           1, smax32(1,
                                                                     sizze_31215 *
                                                                     sizze_31214)));
            }
            // read in the per-group-results
            {
                if (slt32(local_tid_34690, squot32(num_groups_34790 + smax32(1,
                                                                             sizze_31215 *
                                                                             sizze_31214) -
                                                   1, smax32(1, sizze_31215 *
                                                             sizze_31214)))) {
                    x_34796 = *(__global
                                float *) &group_res_arr_mem_38875[(squot32(group_id_34691,
                                                                           squot32(num_groups_34790 +
                                                                                   smax32(1,
                                                                                          sizze_31215 *
                                                                                          sizze_31214) -
                                                                                   1,
                                                                                   smax32(1,
                                                                                          sizze_31215 *
                                                                                          sizze_31214))) *
                                                                   squot32(num_groups_34790 +
                                                                           smax32(1,
                                                                                  sizze_31215 *
                                                                                  sizze_31214) -
                                                                           1,
                                                                           smax32(1,
                                                                                  sizze_31215 *
                                                                                  sizze_31214)) +
                                                                   local_tid_34690) *
                                                                  4];
                } else {
                    x_34796 = 0.0F;
                }
                *(__local float *) &red_arr_mem_38879[local_tid_34690 * 4] =
                    x_34796;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            // reduce the per-group results
            {
                int32_t offset_38892;
                int32_t skip_waves_38893;
                float x_38884;
                float x_38885;
                
                offset_38892 = 0;
                // participating threads read initial accumulator
                {
                    if (slt32(local_tid_34690, group_sizze_34780)) {
                        x_38884 = *(__local
                                    float *) &red_arr_mem_38879[(local_tid_34690 +
                                                                 offset_38892) *
                                                                4];
                    }
                }
                offset_38892 = 1;
                while (slt32(offset_38892, wave_sizze_38870)) {
                    if (slt32(local_tid_34690 + offset_38892,
                              group_sizze_34780) && ((local_tid_34690 -
                                                      squot32(local_tid_34690,
                                                              wave_sizze_38870) *
                                                      wave_sizze_38870) & (2 *
                                                                           offset_38892 -
                                                                           1)) ==
                        0) {
                        // read array element
                        {
                            x_38885 = *(volatile __local
                                        float *) &red_arr_mem_38879[(local_tid_34690 +
                                                                     offset_38892) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            float res_38886 = x_38884 + x_38885;
                            
                            x_38884 = res_38886;
                        }
                        // write result of operation
                        {
                            *(volatile __local
                              float *) &red_arr_mem_38879[local_tid_34690 * 4] =
                                x_38884;
                        }
                    }
                    offset_38892 *= 2;
                }
                skip_waves_38893 = 1;
                while (slt32(skip_waves_38893, squot32(group_sizze_34780 +
                                                       wave_sizze_38870 - 1,
                                                       wave_sizze_38870))) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                    offset_38892 = skip_waves_38893 * wave_sizze_38870;
                    if (slt32(local_tid_34690 + offset_38892,
                              group_sizze_34780) && ((local_tid_34690 -
                                                      squot32(local_tid_34690,
                                                              wave_sizze_38870) *
                                                      wave_sizze_38870) == 0 &&
                                                     (squot32(local_tid_34690,
                                                              wave_sizze_38870) &
                                                      (2 * skip_waves_38893 -
                                                       1)) == 0)) {
                        // read array element
                        {
                            x_38885 = *(__local
                                        float *) &red_arr_mem_38879[(local_tid_34690 +
                                                                     offset_38892) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            float res_38886 = x_38884 + x_38885;
                            
                            x_38884 = res_38886;
                        }
                        // write result of operation
                        {
                            *(__local
                              float *) &red_arr_mem_38879[local_tid_34690 * 4] =
                                x_38884;
                        }
                    }
                    skip_waves_38893 *= 2;
                }
                // and back to memory with the final result
                {
                    if (local_tid_34690 == 0) {
                        *(__global float *) &mem_38195[(gtid_34662 *
                                                        sizze_31214 +
                                                        gtid_34663) * 4] =
                            x_38884;
                    }
                }
            }
        }
    }
}
// Segmented reduction kernel: for each segment, sums the squares of float
// values read from res_mem_38290 and writes one float per segment to
// mem_38329.
//
// Parameters:
//   sizze_31214   - row stride (in elements) used to index res_mem_38290.
//   sizze_31215   - used only to derive the number of groups per segment
//                   (presumably the number of segments - TODO confirm).
//   n_31219       - bound on the per-segment element index covered by the
//                   chunked loop below.
//   num_groups_35536 - total number of work-groups; divided (rounding up) by
//                   max(1, sizze_31215) to get the groups per segment.
//   res_mem_38290 - input floats, read at (segment * sizze_31214 + index).
//   mem_38326     - one int32 per segment; elements whose index is not below
//                   this value contribute 0 to the sum.
//   mem_38329     - output, one float per segment.
//   thread_per_segment_39052 - stride between consecutive elements handled by
//                   the same thread.
//   group_res_arr_mem_39053 - scratch, one float per work-group, holding
//                   per-group partial sums when a segment spans several groups.
//   counter_mem_39055 - int counters indexed by (segment mod 1024), used to
//                   detect the last group to finish a segment.
//
// NOTE(review): squot32, srem32, slt32, smin32, smax32, mem_fence_global,
// ALIGNED_LOCAL_MEMORY, LOCKSTEP_WIDTH and mainzigroup_sizze_35449 are defined
// outside this kernel; the comments below assume the s-prefixed helpers are
// signed 32-bit quotient, remainder, less-than, min and max - confirm.
__kernel void segred_large_35467(int32_t sizze_31214, int32_t sizze_31215,
                                 int32_t n_31219, int32_t num_groups_35536,
                                 __global unsigned char *res_mem_38290, __global
                                 unsigned char *mem_38326, __global
                                 unsigned char *mem_38329,
                                 int32_t thread_per_segment_39052, __global
                                 unsigned char *group_res_arr_mem_39053,
                                 __global unsigned char *counter_mem_39055)
{
    const int32_t group_sizze_35526 = mainzigroup_sizze_35449;
    // block_dim0..2 are not referenced anywhere else in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    // Local scratch: 4 bytes per thread for the reduction array, plus one
    // byte for the is-last-group flag shared by the whole work-group.
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39057_backing_0, 4 *
                         mainzigroup_sizze_35449);
    ALIGNED_LOCAL_MEMORY(sync_arr_mem_39059_backing_1, 1);
    
    int32_t global_tid_35467;
    int32_t local_tid_35468;
    int32_t group_sizze_39049;
    int32_t wave_sizze_39048;
    int32_t group_id_35469;
    
    global_tid_35467 = get_global_id(0);
    local_tid_35468 = get_local_id(0);
    // group_sizze_39049 is assigned here but never read again in this kernel;
    // the constant group_sizze_35526 is used for all bounds instead.
    group_sizze_39049 = get_local_size(0);
    wave_sizze_39048 = LOCKSTEP_WIDTH;
    group_id_35469 = get_group_id(0);
    
    int32_t gtid_35444;
    int32_t gtid_35466;
    __local char *red_arr_mem_39057;
    
    red_arr_mem_39057 = (__local char *) red_arr_mem_39057_backing_0;
    
    __local char *sync_arr_mem_39059;
    
    sync_arr_mem_39059 = (__local char *) sync_arr_mem_39059_backing_1;
    // Segment index of this work-group: group id divided by the number of
    // groups per segment, where groups per segment is
    // (num_groups + max(1, sizze_31215) - 1) / max(1, sizze_31215).
    // That groups-per-segment expression is repeated verbatim throughout.
    gtid_35444 = squot32(group_id_35469, squot32(num_groups_35536 + smax32(1,
                                                                           sizze_31215) -
                                                 1, smax32(1, sizze_31215)));
    
    // Number of loop iterations for this thread: the smaller of
    //   (a) n divided, rounding up, by (group size * groups per segment), and
    //   (b) the elements left from this thread's starting index up to n,
    //       divided, rounding up, by thread_per_segment.
    int32_t chunk_sizze_39061 = smin32(squot32(n_31219 + group_sizze_35526 *
                                               squot32(num_groups_35536 +
                                                       smax32(1, sizze_31215) -
                                                       1, smax32(1,
                                                                 sizze_31215)) -
                                               1, group_sizze_35526 *
                                               squot32(num_groups_35536 +
                                                       smax32(1, sizze_31215) -
                                                       1, smax32(1,
                                                                 sizze_31215))),
                                       squot32(n_31219 -
                                               srem32(global_tid_35467,
                                                      group_sizze_35526 *
                                                      squot32(num_groups_35536 +
                                                              smax32(1,
                                                                     sizze_31215) -
                                                              1, smax32(1,
                                                                        sizze_31215))) +
                                               thread_per_segment_39052 - 1,
                                               thread_per_segment_39052));
    // x_35542 is this thread's running sum; x_35543 is the newly mapped value.
    float x_35542;
    float x_35543;
    
    x_35542 = 0.0F;
    for (int32_t i_39065 = 0; i_39065 < chunk_sizze_39061; i_39065++) {
        // Element index within the segment: this thread's position among the
        // threads of its segment, advanced by thread_per_segment per iteration.
        gtid_35466 = srem32(global_tid_35467, group_sizze_35526 *
                            squot32(num_groups_35536 + smax32(1, sizze_31215) -
                                    1, smax32(1, sizze_31215))) +
            thread_per_segment_39052 * i_39065;
        // apply map function
        {
            int32_t res_35546;
            bool cond_35548;
            float res_35549;
            float res_35551;
            
            // Per-segment limit: only indices below it are loaded, the rest
            // are treated as 0 and so do not affect the sum.
            res_35546 = *(__global int32_t *) &mem_38326[gtid_35444 * 4];
            cond_35548 = slt32(gtid_35466, res_35546);
            if (cond_35548) {
                float res_35550 = *(__global
                                    float *) &res_mem_38290[(gtid_35444 *
                                                             sizze_31214 +
                                                             gtid_35466) * 4];
                
                res_35549 = res_35550;
            } else {
                res_35549 = 0.0F;
            }
            // The mapped value is the square of the (possibly zeroed) element.
            res_35551 = res_35549 * res_35549;
            // save results to be reduced
            {
                x_35543 = res_35551;
            }
            // save map-out results
            { }
            // apply reduction operator
            {
                float res_35544 = x_35542 + x_35543;
                
                x_35542 = res_35544;
            }
        }
    }
    // to reduce current chunk, first store our result to memory
    {
        *(__local float *) &red_arr_mem_39057[local_tid_35468 * 4] = x_35542;
    }
    // Make every thread's partial sum visible before the group-wide reduction.
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t offset_39066;
    int32_t skip_waves_39067;
    float x_39062;
    float x_39063;
    
    offset_39066 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_35468, group_sizze_35526)) {
            x_39062 = *(__local float *) &red_arr_mem_39057[(local_tid_35468 +
                                                             offset_39066) * 4];
        }
    }
    // Stage 1: tree reduction inside each wave of wave_sizze threads, with the
    // offset doubling each step. A thread takes part when its lane index
    // (local id minus wave start) has the low bits (2 * offset - 1) clear and
    // its partner at local id + offset is inside the group.
    // No barrier is issued inside this loop and the accesses are volatile.
    // NOTE(review): presumably this relies on the threads of one wave running
    // in lockstep - confirm against the definition of LOCKSTEP_WIDTH.
    offset_39066 = 1;
    while (slt32(offset_39066, wave_sizze_39048)) {
        if (slt32(local_tid_35468 + offset_39066, group_sizze_35526) &&
            ((local_tid_35468 - squot32(local_tid_35468, wave_sizze_39048) *
              wave_sizze_39048) & (2 * offset_39066 - 1)) == 0) {
            // read array element
            {
                x_39063 = *(volatile __local
                            float *) &red_arr_mem_39057[(local_tid_35468 +
                                                         offset_39066) * 4];
            }
            // apply reduction operation
            {
                float res_39064 = x_39062 + x_39063;
                
                x_39062 = res_39064;
            }
            // write result of operation
            {
                *(volatile __local float *) &red_arr_mem_39057[local_tid_35468 *
                                                               4] = x_39062;
            }
        }
        offset_39066 *= 2;
    }
    // Stage 2: combine the per-wave results. The loop runs while skip_waves is
    // below the number of waves in the group (group size divided by wave size,
    // rounding up). Only the first thread of a wave whose wave index has the
    // low bits (2 * skip_waves - 1) clear participates, adding the value stored
    // skip_waves waves further along. A barrier precedes every step.
    skip_waves_39067 = 1;
    while (slt32(skip_waves_39067, squot32(group_sizze_35526 +
                                           wave_sizze_39048 - 1,
                                           wave_sizze_39048))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_39066 = skip_waves_39067 * wave_sizze_39048;
        if (slt32(local_tid_35468 + offset_39066, group_sizze_35526) &&
            ((local_tid_35468 - squot32(local_tid_35468, wave_sizze_39048) *
              wave_sizze_39048) == 0 && (squot32(local_tid_35468,
                                                 wave_sizze_39048) & (2 *
                                                                      skip_waves_39067 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_39063 = *(__local
                            float *) &red_arr_mem_39057[(local_tid_35468 +
                                                         offset_39066) * 4];
            }
            // apply reduction operation
            {
                float res_39064 = x_39062 + x_39063;
                
                x_39062 = res_39064;
            }
            // write result of operation
            {
                *(__local float *) &red_arr_mem_39057[local_tid_35468 * 4] =
                    x_39062;
            }
        }
        skip_waves_39067 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // At this point thread 0 holds the sum for the whole work-group in x_39062.
    // If exactly one group serves each segment, that is already the final
    // answer; otherwise the groups of a segment must be combined.
    if (squot32(num_groups_35536 + smax32(1, sizze_31215) - 1, smax32(1,
                                                                      sizze_31215)) ==
        1) {
        // first thread in group saves final result to memory
        {
            if (local_tid_35468 == 0) {
                *(__global float *) &mem_38329[gtid_35444 * 4] = x_39062;
            }
        }
    } else {
        int32_t old_counter_39068;
        
        // first thread in group saves group result to memory
        {
            if (local_tid_35468 == 0) {
                *(__global float *) &group_res_arr_mem_39053[group_id_35469 *
                                                             4] = x_39062;
                // The fence is issued after the partial-sum store and before
                // the counter increment below.
                mem_fence_global();
                // Increment the counter slot for this segment (segment index
                // mod 1024) and keep the previous value.
                old_counter_39068 = atomic_add((volatile __global
                                                int *) &counter_mem_39055[srem32(squot32(group_id_35469,
                                                                                         squot32(num_groups_35536 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215))),
                                                                                 1024) *
                                                                          4],
                                               1);
                // This group is the last one for its segment when the previous
                // counter value equals groups per segment minus one.
                *(__local bool *) &sync_arr_mem_39059[0] = old_counter_39068 ==
                    squot32(num_groups_35536 + smax32(1, sizze_31215) - 1,
                            smax32(1, sizze_31215)) - 1;
            }
        }
        // Publish the flag written by thread 0 to the rest of the group.
        barrier(CLK_LOCAL_MEM_FENCE);
        
        bool is_last_group_39069 = *(__local bool *) &sync_arr_mem_39059[0];
        
        if (is_last_group_39069) {
            if (local_tid_35468 == 0) {
                // Subtract groups per segment from the counter slot, undoing
                // the increments made by the groups of this segment.
                old_counter_39068 = atomic_add((volatile __global
                                                int *) &counter_mem_39055[srem32(squot32(group_id_35469,
                                                                                         squot32(num_groups_35536 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215))),
                                                                                 1024) *
                                                                          4],
                                               0 - squot32(num_groups_35536 +
                                                           smax32(1,
                                                                  sizze_31215) -
                                                           1, smax32(1,
                                                                     sizze_31215)));
            }
            // read in the per-group-results
            {
                // Thread t (for t below groups per segment) loads the partial
                // sum of the t-th group of this segment, at index
                // segment * groups per segment + t; other threads load 0.
                if (slt32(local_tid_35468, squot32(num_groups_35536 + smax32(1,
                                                                             sizze_31215) -
                                                   1, smax32(1,
                                                             sizze_31215)))) {
                    x_35542 = *(__global
                                float *) &group_res_arr_mem_39053[(squot32(group_id_35469,
                                                                           squot32(num_groups_35536 +
                                                                                   smax32(1,
                                                                                          sizze_31215) -
                                                                                   1,
                                                                                   smax32(1,
                                                                                          sizze_31215))) *
                                                                   squot32(num_groups_35536 +
                                                                           smax32(1,
                                                                                  sizze_31215) -
                                                                           1,
                                                                           smax32(1,
                                                                                  sizze_31215)) +
                                                                   local_tid_35468) *
                                                                  4];
                } else {
                    x_35542 = 0.0F;
                }
                *(__local float *) &red_arr_mem_39057[local_tid_35468 * 4] =
                    x_35542;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            // reduce the per-group results
            {
                // Same two-stage reduction as above, applied to the per-group
                // partial sums. x_39062 and x_39063 declared here shadow the
                // outer variables of the same names.
                int32_t offset_39070;
                int32_t skip_waves_39071;
                float x_39062;
                float x_39063;
                
                offset_39070 = 0;
                // participating threads read initial accumulator
                {
                    if (slt32(local_tid_35468, group_sizze_35526)) {
                        x_39062 = *(__local
                                    float *) &red_arr_mem_39057[(local_tid_35468 +
                                                                 offset_39070) *
                                                                4];
                    }
                }
                // Stage 1: barrier-free, volatile reduction within each wave.
                offset_39070 = 1;
                while (slt32(offset_39070, wave_sizze_39048)) {
                    if (slt32(local_tid_35468 + offset_39070,
                              group_sizze_35526) && ((local_tid_35468 -
                                                      squot32(local_tid_35468,
                                                              wave_sizze_39048) *
                                                      wave_sizze_39048) & (2 *
                                                                           offset_39070 -
                                                                           1)) ==
                        0) {
                        // read array element
                        {
                            x_39063 = *(volatile __local
                                        float *) &red_arr_mem_39057[(local_tid_35468 +
                                                                     offset_39070) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            float res_39064 = x_39062 + x_39063;
                            
                            x_39062 = res_39064;
                        }
                        // write result of operation
                        {
                            *(volatile __local
                              float *) &red_arr_mem_39057[local_tid_35468 * 4] =
                                x_39062;
                        }
                    }
                    offset_39070 *= 2;
                }
                // Stage 2: combine wave results, with a barrier before each
                // step.
                skip_waves_39071 = 1;
                while (slt32(skip_waves_39071, squot32(group_sizze_35526 +
                                                       wave_sizze_39048 - 1,
                                                       wave_sizze_39048))) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                    offset_39070 = skip_waves_39071 * wave_sizze_39048;
                    if (slt32(local_tid_35468 + offset_39070,
                              group_sizze_35526) && ((local_tid_35468 -
                                                      squot32(local_tid_35468,
                                                              wave_sizze_39048) *
                                                      wave_sizze_39048) == 0 &&
                                                     (squot32(local_tid_35468,
                                                              wave_sizze_39048) &
                                                      (2 * skip_waves_39071 -
                                                       1)) == 0)) {
                        // read array element
                        {
                            x_39063 = *(__local
                                        float *) &red_arr_mem_39057[(local_tid_35468 +
                                                                     offset_39070) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            float res_39064 = x_39062 + x_39063;
                            
                            x_39062 = res_39064;
                        }
                        // write result of operation
                        {
                            *(__local
                              float *) &red_arr_mem_39057[local_tid_35468 * 4] =
                                x_39062;
                        }
                    }
                    skip_waves_39071 *= 2;
                }
                // and back to memory with the final result
                {
                    if (local_tid_35468 == 0) {
                        *(__global float *) &mem_38329[gtid_35444 * 4] =
                            x_39062;
                    }
                }
            }
        }
    }
}
__kernel void segred_large_35492(int32_t sizze_31215, int32_t sizze_31216,
                                 int32_t n_31219, int32_t num_groups_35508,
                                 __global unsigned char *images_mem_37894,
                                 __global unsigned char *mem_38326,
                                 int32_t thread_per_segment_39017, __global
                                 unsigned char *group_res_arr_mem_39018,
                                 __global unsigned char *counter_mem_39020)
{
    const int32_t group_sizze_35498 = mainzigroup_sizze_35474;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39022_backing_0, 4 *
                         mainzigroup_sizze_35474);
    ALIGNED_LOCAL_MEMORY(sync_arr_mem_39024_backing_1, 1);
    
    int32_t global_tid_35492;
    int32_t local_tid_35493;
    int32_t group_sizze_39014;
    int32_t wave_sizze_39013;
    int32_t group_id_35494;
    
    global_tid_35492 = get_global_id(0);
    local_tid_35493 = get_local_id(0);
    group_sizze_39014 = get_local_size(0);
    wave_sizze_39013 = LOCKSTEP_WIDTH;
    group_id_35494 = get_group_id(0);
    
    int32_t gtid_35470;
    int32_t gtid_35491;
    __local char *red_arr_mem_39022;
    
    red_arr_mem_39022 = (__local char *) red_arr_mem_39022_backing_0;
    
    __local char *sync_arr_mem_39024;
    
    sync_arr_mem_39024 = (__local char *) sync_arr_mem_39024_backing_1;
    gtid_35470 = squot32(group_id_35494, squot32(num_groups_35508 + smax32(1,
                                                                           sizze_31215) -
                                                 1, smax32(1, sizze_31215)));
    
    int32_t chunk_sizze_39026 = smin32(squot32(n_31219 + group_sizze_35498 *
                                               squot32(num_groups_35508 +
                                                       smax32(1, sizze_31215) -
                                                       1, smax32(1,
                                                                 sizze_31215)) -
                                               1, group_sizze_35498 *
                                               squot32(num_groups_35508 +
                                                       smax32(1, sizze_31215) -
                                                       1, smax32(1,
                                                                 sizze_31215))),
                                       squot32(n_31219 -
                                               srem32(global_tid_35492,
                                                      group_sizze_35498 *
                                                      squot32(num_groups_35508 +
                                                              smax32(1,
                                                                     sizze_31215) -
                                                              1, smax32(1,
                                                                        sizze_31215))) +
                                               thread_per_segment_39017 - 1,
                                               thread_per_segment_39017));
    int32_t x_35514;
    int32_t x_35515;
    
    x_35514 = 0;
    for (int32_t i_39030 = 0; i_39030 < chunk_sizze_39026; i_39030++) {
        gtid_35491 = srem32(global_tid_35492, group_sizze_35498 *
                            squot32(num_groups_35508 + smax32(1, sizze_31215) -
                                    1, smax32(1, sizze_31215))) +
            thread_per_segment_39017 * i_39030;
        // apply map function
        {
            float x_35518;
            bool res_35519;
            bool cond_35520;
            int32_t res_35521;
            
            x_35518 = *(__global float *) &images_mem_37894[(gtid_35470 *
                                                             sizze_31216 +
                                                             gtid_35491) * 4];
            res_35519 = futrts_isnan32(x_35518);
            cond_35520 = !res_35519;
            if (cond_35520) {
                res_35521 = 1;
            } else {
                res_35521 = 0;
            }
            // save results to be reduced
            {
                x_35515 = res_35521;
            }
            // save map-out results
            { }
            // apply reduction operator
            {
                int32_t res_35516 = x_35514 + x_35515;
                
                x_35514 = res_35516;
            }
        }
    }
    // to reduce current chunk, first store our result to memory
    {
        *(__local int32_t *) &red_arr_mem_39022[local_tid_35493 * 4] = x_35514;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t offset_39031;
    int32_t skip_waves_39032;
    int32_t x_39027;
    int32_t x_39028;
    
    offset_39031 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_35493, group_sizze_35498)) {
            x_39027 = *(__local int32_t *) &red_arr_mem_39022[(local_tid_35493 +
                                                               offset_39031) *
                                                              4];
        }
    }
    offset_39031 = 1;
    while (slt32(offset_39031, wave_sizze_39013)) {
        if (slt32(local_tid_35493 + offset_39031, group_sizze_35498) &&
            ((local_tid_35493 - squot32(local_tid_35493, wave_sizze_39013) *
              wave_sizze_39013) & (2 * offset_39031 - 1)) == 0) {
            // read array element
            {
                x_39028 = *(volatile __local
                            int32_t *) &red_arr_mem_39022[(local_tid_35493 +
                                                           offset_39031) * 4];
            }
            // apply reduction operation
            {
                int32_t res_39029 = x_39027 + x_39028;
                
                x_39027 = res_39029;
            }
            // write result of operation
            {
                *(volatile __local
                  int32_t *) &red_arr_mem_39022[local_tid_35493 * 4] = x_39027;
            }
        }
        offset_39031 *= 2;
    }
    skip_waves_39032 = 1;
    while (slt32(skip_waves_39032, squot32(group_sizze_35498 +
                                           wave_sizze_39013 - 1,
                                           wave_sizze_39013))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_39031 = skip_waves_39032 * wave_sizze_39013;
        if (slt32(local_tid_35493 + offset_39031, group_sizze_35498) &&
            ((local_tid_35493 - squot32(local_tid_35493, wave_sizze_39013) *
              wave_sizze_39013) == 0 && (squot32(local_tid_35493,
                                                 wave_sizze_39013) & (2 *
                                                                      skip_waves_39032 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_39028 = *(__local
                            int32_t *) &red_arr_mem_39022[(local_tid_35493 +
                                                           offset_39031) * 4];
            }
            // apply reduction operation
            {
                int32_t res_39029 = x_39027 + x_39028;
                
                x_39027 = res_39029;
            }
            // write result of operation
            {
                *(__local int32_t *) &red_arr_mem_39022[local_tid_35493 * 4] =
                    x_39027;
            }
        }
        skip_waves_39032 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (squot32(num_groups_35508 + smax32(1, sizze_31215) - 1, smax32(1,
                                                                      sizze_31215)) ==
        1) {
        // first thread in group saves final result to memory
        {
            if (local_tid_35493 == 0) {
                *(__global int32_t *) &mem_38326[gtid_35470 * 4] = x_39027;
            }
        }
    } else {
        int32_t old_counter_39033;
        
        // first thread in group saves group result to memory
        {
            if (local_tid_35493 == 0) {
                *(__global int32_t *) &group_res_arr_mem_39018[group_id_35494 *
                                                               4] = x_39027;
                mem_fence_global();
                old_counter_39033 = atomic_add((volatile __global
                                                int *) &counter_mem_39020[srem32(squot32(group_id_35494,
                                                                                         squot32(num_groups_35508 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215))),
                                                                                 1024) *
                                                                          4],
                                               1);
                *(__local bool *) &sync_arr_mem_39024[0] = old_counter_39033 ==
                    squot32(num_groups_35508 + smax32(1, sizze_31215) - 1,
                            smax32(1, sizze_31215)) - 1;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        
        bool is_last_group_39034 = *(__local bool *) &sync_arr_mem_39024[0];
        
        if (is_last_group_39034) {
            if (local_tid_35493 == 0) {
                old_counter_39033 = atomic_add((volatile __global
                                                int *) &counter_mem_39020[srem32(squot32(group_id_35494,
                                                                                         squot32(num_groups_35508 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215))),
                                                                                 1024) *
                                                                          4],
                                               0 - squot32(num_groups_35508 +
                                                           smax32(1,
                                                                  sizze_31215) -
                                                           1, smax32(1,
                                                                     sizze_31215)));
            }
            // read in the per-group-results
            {
                if (slt32(local_tid_35493, squot32(num_groups_35508 + smax32(1,
                                                                             sizze_31215) -
                                                   1, smax32(1,
                                                             sizze_31215)))) {
                    x_35514 = *(__global
                                int32_t *) &group_res_arr_mem_39018[(squot32(group_id_35494,
                                                                             squot32(num_groups_35508 +
                                                                                     smax32(1,
                                                                                            sizze_31215) -
                                                                                     1,
                                                                                     smax32(1,
                                                                                            sizze_31215))) *
                                                                     squot32(num_groups_35508 +
                                                                             smax32(1,
                                                                                    sizze_31215) -
                                                                             1,
                                                                             smax32(1,
                                                                                    sizze_31215)) +
                                                                     local_tid_35493) *
                                                                    4];
                } else {
                    x_35514 = 0;
                }
                *(__local int32_t *) &red_arr_mem_39022[local_tid_35493 * 4] =
                    x_35514;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            // reduce the per-group results
            {
                int32_t offset_39035;
                int32_t skip_waves_39036;
                int32_t x_39027;
                int32_t x_39028;
                
                offset_39035 = 0;
                // participating threads read initial accumulator
                {
                    if (slt32(local_tid_35493, group_sizze_35498)) {
                        x_39027 = *(__local
                                    int32_t *) &red_arr_mem_39022[(local_tid_35493 +
                                                                   offset_39035) *
                                                                  4];
                    }
                }
                offset_39035 = 1;
                while (slt32(offset_39035, wave_sizze_39013)) {
                    if (slt32(local_tid_35493 + offset_39035,
                              group_sizze_35498) && ((local_tid_35493 -
                                                      squot32(local_tid_35493,
                                                              wave_sizze_39013) *
                                                      wave_sizze_39013) & (2 *
                                                                           offset_39035 -
                                                                           1)) ==
                        0) {
                        // read array element
                        {
                            x_39028 = *(volatile __local
                                        int32_t *) &red_arr_mem_39022[(local_tid_35493 +
                                                                       offset_39035) *
                                                                      4];
                        }
                        // apply reduction operation
                        {
                            int32_t res_39029 = x_39027 + x_39028;
                            
                            x_39027 = res_39029;
                        }
                        // write result of operation
                        {
                            *(volatile __local
                              int32_t *) &red_arr_mem_39022[local_tid_35493 *
                                                            4] = x_39027;
                        }
                    }
                    offset_39035 *= 2;
                }
                skip_waves_39036 = 1;
                while (slt32(skip_waves_39036, squot32(group_sizze_35498 +
                                                       wave_sizze_39013 - 1,
                                                       wave_sizze_39013))) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                    offset_39035 = skip_waves_39036 * wave_sizze_39013;
                    if (slt32(local_tid_35493 + offset_39035,
                              group_sizze_35498) && ((local_tid_35493 -
                                                      squot32(local_tid_35493,
                                                              wave_sizze_39013) *
                                                      wave_sizze_39013) == 0 &&
                                                     (squot32(local_tid_35493,
                                                              wave_sizze_39013) &
                                                      (2 * skip_waves_39036 -
                                                       1)) == 0)) {
                        // read array element
                        {
                            x_39028 = *(__local
                                        int32_t *) &red_arr_mem_39022[(local_tid_35493 +
                                                                       offset_39035) *
                                                                      4];
                        }
                        // apply reduction operation
                        {
                            int32_t res_39029 = x_39027 + x_39028;
                            
                            x_39027 = res_39029;
                        }
                        // write result of operation
                        {
                            *(__local
                              int32_t *) &red_arr_mem_39022[local_tid_35493 *
                                                            4] = x_39027;
                        }
                    }
                    skip_waves_39036 *= 2;
                }
                // and back to memory with the final result
                {
                    if (local_tid_35493 == 0) {
                        *(__global int32_t *) &mem_38326[gtid_35470 * 4] =
                            x_39027;
                    }
                }
            }
        }
    }
}
// Segmented float-sum reduction kernel. Each work-group accumulates a partial
// sum for one segment (index gtid_35700); the per-segment total is written as
// a float to mem_38356[gtid_35700].
//
// In the comments below, G denotes the expression
//   squot32(num_groups_35740 + smax32(1, sizze_31215) - 1,
//           smax32(1, sizze_31215))
// which this kernel recomputes inline wherever it is needed. It is used here
// as the number of work-groups that cooperate on one segment.
//
// Parameters, as used in this kernel body:
//   sizze_31214              - row stride (in elements) when indexing
//                              res_mem_38290.
//   sizze_31215              - clamped with smax32(1, .) and used as the
//                              divisor when computing G.
//   res_31594                - bound used when computing the per-thread
//                              chunk size.
//   num_groups_35740         - dividend of the computation of G.
//   res_mem_38290            - read as float; the values being summed.
//   res_mem_38339,
//   res_mem_38340            - read as int32_t at index gtid_35700; used to
//                              bound and offset the index into res_mem_38290.
//   mem_38356                - written as float, one result per segment.
//   thread_per_segment_39121 - stride between the elements one thread visits.
//   group_res_arr_mem_39122  - per-group partial sums (float), indexed by
//                              group id.
//   counter_mem_39124        - int counters updated with atomic_add to detect
//                              the last group to finish a segment.
//
// NOTE(review): squot32, srem32, smax32, smin32, slt32, mem_fence_global,
// ALIGNED_LOCAL_MEMORY, LOCKSTEP_WIDTH and mainzigroup_sizze_35705 are defined
// outside this chunk. The helpers are presumably signed 32-bit quotient,
// remainder, max, min and less-than -- confirm against their definitions.
__kernel void segred_large_35723(int32_t sizze_31214, int32_t sizze_31215,
                                 int32_t res_31594, int32_t num_groups_35740,
                                 __global unsigned char *res_mem_38290, __global
                                 unsigned char *res_mem_38339, __global
                                 unsigned char *res_mem_38340, __global
                                 unsigned char *mem_38356,
                                 int32_t thread_per_segment_39121, __global
                                 unsigned char *group_res_arr_mem_39122,
                                 __global unsigned char *counter_mem_39124)
{
    // Group size used for all index arithmetic in this kernel; taken from a
    // constant defined outside this kernel rather than from get_local_size.
    const int32_t group_sizze_35730 = mainzigroup_sizze_35705;
    // block_dim0..2 are not referenced anywhere else in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    // Backing storage for the reduction scratch array (4 bytes per work-item,
    // accessed as float below) and for a one-byte flag (accessed as bool).
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39126_backing_0, 4 *
                         mainzigroup_sizze_35705);
    ALIGNED_LOCAL_MEMORY(sync_arr_mem_39128_backing_1, 1);
    
    int32_t global_tid_35723;
    int32_t local_tid_35724;
    int32_t group_sizze_39118;
    int32_t wave_sizze_39117;
    int32_t group_id_35725;
    
    global_tid_35723 = get_global_id(0);
    local_tid_35724 = get_local_id(0);
    // group_sizze_39118 is assigned here but not read again in this kernel.
    group_sizze_39118 = get_local_size(0);
    wave_sizze_39117 = LOCKSTEP_WIDTH;
    group_id_35725 = get_group_id(0);
    
    int32_t gtid_35700;
    int32_t gtid_35722;
    __local char *red_arr_mem_39126;
    
    red_arr_mem_39126 = (__local char *) red_arr_mem_39126_backing_0;
    
    __local char *sync_arr_mem_39128;
    
    sync_arr_mem_39128 = (__local char *) sync_arr_mem_39128_backing_1;
    // Segment index handled by this group: group id divided by G.
    gtid_35700 = squot32(group_id_35725, squot32(num_groups_35740 + smax32(1,
                                                                           sizze_31215) -
                                                 1, smax32(1, sizze_31215)));
    
    // Number of loop iterations for this thread: the smaller of
    //   squot32(res_31594 + group_sizze_35730 * G - 1, group_sizze_35730 * G)
    // and
    //   squot32(res_31594 - srem32(global_tid_35723, group_sizze_35730 * G)
    //           + thread_per_segment_39121 - 1, thread_per_segment_39121).
    int32_t chunk_sizze_39130 = smin32(squot32(res_31594 + group_sizze_35730 *
                                               squot32(num_groups_35740 +
                                                       smax32(1, sizze_31215) -
                                                       1, smax32(1,
                                                                 sizze_31215)) -
                                               1, group_sizze_35730 *
                                               squot32(num_groups_35740 +
                                                       smax32(1, sizze_31215) -
                                                       1, smax32(1,
                                                                 sizze_31215))),
                                       squot32(res_31594 -
                                               srem32(global_tid_35723,
                                                      group_sizze_35730 *
                                                      squot32(num_groups_35740 +
                                                              smax32(1,
                                                                     sizze_31215) -
                                                              1, smax32(1,
                                                                        sizze_31215))) +
                                               thread_per_segment_39121 - 1,
                                               thread_per_segment_39121));
    // x_35746 is the running sum (accumulator); x_35747 is the newly mapped
    // element that gets added to it.
    float x_35746;
    float x_35747;
    
    // Start from 0.0F, the value also used for out-of-range elements below.
    x_35746 = 0.0F;
    // Sequential per-thread phase. Iteration i handles element index
    //   srem32(global_tid_35723, group_sizze_35730 * G)
    //     + thread_per_segment_39121 * i
    // within the segment.
    for (int32_t i_39134 = 0; i_39134 < chunk_sizze_39130; i_39134++) {
        gtid_35722 = srem32(global_tid_35723, group_sizze_35730 *
                            squot32(num_groups_35740 + smax32(1, sizze_31215) -
                                    1, smax32(1, sizze_31215))) +
            thread_per_segment_39121 * i_39134;
        // apply map function
        {
            int32_t x_35750;
            int32_t x_35751;
            bool cond_35753;
            float res_35754;
            
            // Per-segment int32 values; both loads depend only on gtid_35700,
            // not on the loop index.
            x_35750 = *(__global int32_t *) &res_mem_38340[gtid_35700 * 4];
            x_35751 = *(__global int32_t *) &res_mem_38339[gtid_35700 * 4];
            // Only element indices below x_35751 are read from memory; all
            // other indices contribute 0.0F to the sum.
            cond_35753 = slt32(gtid_35722, x_35751);
            if (cond_35753) {
                int32_t x_35755;
                int32_t x_35756;
                int32_t i_35757;
                float res_35758;
                
                // Column index: i_35757 = 1 + gtid_35722 + x_35750 - x_35751.
                x_35755 = gtid_35722 + x_35750;
                x_35756 = x_35755 - x_35751;
                i_35757 = 1 + x_35756;
                // Load the float at row gtid_35700, column i_35757, using
                // sizze_31214 as the row stride.
                res_35758 = *(__global float *) &res_mem_38290[(gtid_35700 *
                                                                sizze_31214 +
                                                                i_35757) * 4];
                res_35754 = res_35758;
            } else {
                res_35754 = 0.0F;
            }
            // save results to be reduced
            {
                x_35747 = res_35754;
            }
            // save map-out results
            { }
            // apply reduction operator
            {
                float res_35748 = x_35746 + x_35747;
                
                x_35746 = res_35748;
            }
        }
    }
    // to reduce current chunk, first store our result to memory
    {
        *(__local float *) &red_arr_mem_39126[local_tid_35724 * 4] = x_35746;
    }
    // Make every work-item's partial sum visible in local memory before the
    // group-level reduction starts.
    barrier(CLK_LOCAL_MEM_FENCE);
    
    // State for the in-group reduction: x_39131 is this thread's accumulator,
    // x_39132 the partner element read from local memory.
    int32_t offset_39135;
    int32_t skip_waves_39136;
    float x_39131;
    float x_39132;
    
    offset_39135 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_35724, group_sizze_35730)) {
            x_39131 = *(__local float *) &red_arr_mem_39126[(local_tid_35724 +
                                                             offset_39135) * 4];
        }
    }
    // Phase 1: combine elements at offsets 1, 2, 4, ... below
    // wave_sizze_39117. A thread participates when its partner index is below
    // group_sizze_35730 and its position within its wave (local id minus wave
    // index times wave size) has the low bits selected by (2 * offset - 1)
    // all zero. No barrier is issued inside this loop; local memory is
    // accessed through volatile pointers instead.
    // NOTE(review): presumably relies on the threads of one wave of
    // LOCKSTEP_WIDTH work-items executing in lockstep -- confirm.
    offset_39135 = 1;
    while (slt32(offset_39135, wave_sizze_39117)) {
        if (slt32(local_tid_35724 + offset_39135, group_sizze_35730) &&
            ((local_tid_35724 - squot32(local_tid_35724, wave_sizze_39117) *
              wave_sizze_39117) & (2 * offset_39135 - 1)) == 0) {
            // read array element
            {
                x_39132 = *(volatile __local
                            float *) &red_arr_mem_39126[(local_tid_35724 +
                                                         offset_39135) * 4];
            }
            // apply reduction operation
            {
                float res_39133 = x_39131 + x_39132;
                
                x_39131 = res_39133;
            }
            // write result of operation
            {
                *(volatile __local float *) &red_arr_mem_39126[local_tid_35724 *
                                                               4] = x_39131;
            }
        }
        offset_39135 *= 2;
    }
    // Phase 2: combine the per-wave results. skip_waves doubles each
    // iteration until it reaches the number of waves in the group, computed
    // as squot32(group_sizze_35730 + wave_sizze_39117 - 1, wave_sizze_39117).
    // Only the thread at position 0 of a wave participates, and only when
    // its wave index has the low bits selected by (2 * skip_waves - 1) all
    // zero. A barrier is executed at the top of every iteration.
    skip_waves_39136 = 1;
    while (slt32(skip_waves_39136, squot32(group_sizze_35730 +
                                           wave_sizze_39117 - 1,
                                           wave_sizze_39117))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        // Partner element lies skip_waves whole waves ahead of this thread.
        offset_39135 = skip_waves_39136 * wave_sizze_39117;
        if (slt32(local_tid_35724 + offset_39135, group_sizze_35730) &&
            ((local_tid_35724 - squot32(local_tid_35724, wave_sizze_39117) *
              wave_sizze_39117) == 0 && (squot32(local_tid_35724,
                                                 wave_sizze_39117) & (2 *
                                                                      skip_waves_39136 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_39132 = *(__local
                            float *) &red_arr_mem_39126[(local_tid_35724 +
                                                         offset_39135) * 4];
            }
            // apply reduction operation
            {
                float res_39133 = x_39131 + x_39132;
                
                x_39131 = res_39133;
            }
            // write result of operation
            {
                *(__local float *) &red_arr_mem_39126[local_tid_35724 * 4] =
                    x_39131;
            }
        }
        skip_waves_39136 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // At this point the accumulator of local thread 0 is what gets written
    // out as the group result. If G == 1 it is stored directly as the
    // segment result; otherwise a second stage combines the G group results.
    if (squot32(num_groups_35740 + smax32(1, sizze_31215) - 1, smax32(1,
                                                                      sizze_31215)) ==
        1) {
        // first thread in group saves final result to memory
        {
            if (local_tid_35724 == 0) {
                *(__global float *) &mem_38356[gtid_35700 * 4] = x_39131;
            }
        }
    } else {
        int32_t old_counter_39137;
        
        // first thread in group saves group result to memory
        {
            if (local_tid_35724 == 0) {
                *(__global float *) &group_res_arr_mem_39122[group_id_35725 *
                                                             4] = x_39131;
                // NOTE(review): mem_fence_global is defined elsewhere;
                // presumably it orders the store above before the counter
                // update below -- confirm.
                mem_fence_global();
                // Increment the counter slot for this segment. The slot index
                // is srem32(segment index, 1024).
                // NOTE(review): presumably the counter buffer holds 1024 int
                // slots -- confirm against the host-side allocation.
                old_counter_39137 = atomic_add((volatile __global
                                                int *) &counter_mem_39124[srem32(squot32(group_id_35725,
                                                                                         squot32(num_groups_35740 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215))),
                                                                                 1024) *
                                                                          4],
                                               1);
                // The flag is true when the value returned by atomic_add
                // equals G - 1; it is shared with the rest of the group
                // through local memory.
                *(__local bool *) &sync_arr_mem_39128[0] = old_counter_39137 ==
                    squot32(num_groups_35740 + smax32(1, sizze_31215) - 1,
                            smax32(1, sizze_31215)) - 1;
            }
        }
        // Make the flag written by thread 0 visible to the whole group.
        barrier(CLK_LOCAL_MEM_FENCE);
        
        bool is_last_group_39138 = *(__local bool *) &sync_arr_mem_39128[0];
        
        // Only the group that observed the final counter value performs the
        // second-stage reduction; all other groups are done.
        if (is_last_group_39138) {
            if (local_tid_35724 == 0) {
                // Add -G to the same counter slot, cancelling the G
                // increments of 1 made for this segment.
                // NOTE(review): presumably so the slot can be reused --
                // confirm.
                old_counter_39137 = atomic_add((volatile __global
                                                int *) &counter_mem_39124[srem32(squot32(group_id_35725,
                                                                                         squot32(num_groups_35740 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215))),
                                                                                 1024) *
                                                                          4],
                                               0 - squot32(num_groups_35740 +
                                                           smax32(1,
                                                                  sizze_31215) -
                                                           1, smax32(1,
                                                                     sizze_31215)));
            }
            // read in the per-group-results
            {
                // Threads with local id below G each load one group result
                // of this segment (index segment * G + local id); the
                // remaining threads use 0.0F.
                if (slt32(local_tid_35724, squot32(num_groups_35740 + smax32(1,
                                                                             sizze_31215) -
                                                   1, smax32(1,
                                                             sizze_31215)))) {
                    x_35746 = *(__global
                                float *) &group_res_arr_mem_39122[(squot32(group_id_35725,
                                                                           squot32(num_groups_35740 +
                                                                                   smax32(1,
                                                                                          sizze_31215) -
                                                                                   1,
                                                                                   smax32(1,
                                                                                          sizze_31215))) *
                                                                   squot32(num_groups_35740 +
                                                                           smax32(1,
                                                                                  sizze_31215) -
                                                                           1,
                                                                           smax32(1,
                                                                                  sizze_31215)) +
                                                                   local_tid_35724) *
                                                                  4];
                } else {
                    x_35746 = 0.0F;
                }
                *(__local float *) &red_arr_mem_39126[local_tid_35724 * 4] =
                    x_35746;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            // reduce the per-group results
            {
                // These declarations shadow the outer x_39131 and x_39132;
                // the loops below repeat the two-phase reduction used above
                // with fresh offset and skip_waves variables.
                int32_t offset_39139;
                int32_t skip_waves_39140;
                float x_39131;
                float x_39132;
                
                offset_39139 = 0;
                // participating threads read initial accumulator
                {
                    if (slt32(local_tid_35724, group_sizze_35730)) {
                        x_39131 = *(__local
                                    float *) &red_arr_mem_39126[(local_tid_35724 +
                                                                 offset_39139) *
                                                                4];
                    }
                }
                // Phase 1 again: offsets below wave_sizze_39117, volatile
                // accesses, no barrier inside the loop.
                offset_39139 = 1;
                while (slt32(offset_39139, wave_sizze_39117)) {
                    if (slt32(local_tid_35724 + offset_39139,
                              group_sizze_35730) && ((local_tid_35724 -
                                                      squot32(local_tid_35724,
                                                              wave_sizze_39117) *
                                                      wave_sizze_39117) & (2 *
                                                                           offset_39139 -
                                                                           1)) ==
                        0) {
                        // read array element
                        {
                            x_39132 = *(volatile __local
                                        float *) &red_arr_mem_39126[(local_tid_35724 +
                                                                     offset_39139) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            float res_39133 = x_39131 + x_39132;
                            
                            x_39131 = res_39133;
                        }
                        // write result of operation
                        {
                            *(volatile __local
                              float *) &red_arr_mem_39126[local_tid_35724 * 4] =
                                x_39131;
                        }
                    }
                    offset_39139 *= 2;
                }
                // Phase 2 again: combine per-wave results, with a barrier at
                // the top of each iteration.
                skip_waves_39140 = 1;
                while (slt32(skip_waves_39140, squot32(group_sizze_35730 +
                                                       wave_sizze_39117 - 1,
                                                       wave_sizze_39117))) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                    offset_39139 = skip_waves_39140 * wave_sizze_39117;
                    if (slt32(local_tid_35724 + offset_39139,
                              group_sizze_35730) && ((local_tid_35724 -
                                                      squot32(local_tid_35724,
                                                              wave_sizze_39117) *
                                                      wave_sizze_39117) == 0 &&
                                                     (squot32(local_tid_35724,
                                                              wave_sizze_39117) &
                                                      (2 * skip_waves_39140 -
                                                       1)) == 0)) {
                        // read array element
                        {
                            x_39132 = *(__local
                                        float *) &red_arr_mem_39126[(local_tid_35724 +
                                                                     offset_39139) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            float res_39133 = x_39131 + x_39132;
                            
                            x_39131 = res_39133;
                        }
                        // write result of operation
                        {
                            *(__local
                              float *) &red_arr_mem_39126[local_tid_35724 * 4] =
                                x_39131;
                        }
                    }
                    skip_waves_39140 *= 2;
                }
                // and back to memory with the final result
                {
                    if (local_tid_35724 == 0) {
                        *(__global float *) &mem_38356[gtid_35700 * 4] =
                            x_39131;
                    }
                }
            }
        }
    }
}
__kernel void segred_large_36423(int32_t sizze_31215, int32_t arg_31616,
                                 int32_t num_groups_36592, __global
                                 unsigned char *mem_38361, __global
                                 unsigned char *mem_38434, __global
                                 unsigned char *mem_38437, __global
                                 unsigned char *mem_38441, __global
                                 unsigned char *mem_38443, __global
                                 unsigned char *mem_38446, __global
                                 unsigned char *mem_38449, __global
                                 unsigned char *mem_38453, __global
                                 unsigned char *group_res_arr_mem_39253,
                                 __global
                                 unsigned char *group_res_arr_mem_39255,
                                 __global
                                 unsigned char *group_res_arr_mem_39257,
                                 __global unsigned char *counter_mem_39259)
{
    const int32_t group_sizze_36582 = mainzigroup_sizze_36405;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39261_backing_0, mainzigroup_sizze_36405);
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39263_backing_1, 4 *
                         mainzigroup_sizze_36405);
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39265_backing_2, 4 *
                         mainzigroup_sizze_36405);
    ALIGNED_LOCAL_MEMORY(sync_arr_mem_39267_backing_3, 1);
    
    int32_t global_tid_36423;
    int32_t local_tid_36424;
    int32_t group_sizze_39249;
    int32_t wave_sizze_39248;
    int32_t group_id_36425;
    
    global_tid_36423 = get_global_id(0);
    local_tid_36424 = get_local_id(0);
    group_sizze_39249 = get_local_size(0);
    wave_sizze_39248 = LOCKSTEP_WIDTH;
    group_id_36425 = get_group_id(0);
    
    int32_t gtid_36399;
    int32_t gtid_36422;
    __local char *red_arr_mem_39261;
    
    red_arr_mem_39261 = (__local char *) red_arr_mem_39261_backing_0;
    
    __local char *red_arr_mem_39263;
    
    red_arr_mem_39263 = (__local char *) red_arr_mem_39263_backing_1;
    
    __local char *red_arr_mem_39265;
    
    red_arr_mem_39265 = (__local char *) red_arr_mem_39265_backing_2;
    
    __local char *sync_arr_mem_39267;
    
    sync_arr_mem_39267 = (__local char *) sync_arr_mem_39267_backing_3;
    gtid_36399 = squot32(group_id_36425, squot32(num_groups_36592 + smax32(1,
                                                                           sizze_31215) -
                                                 1, smax32(1, sizze_31215)));
    
    int32_t chunk_sizze_39269;
    int32_t starting_point_39270 = srem32(global_tid_36423, group_sizze_36582 *
                                          squot32(num_groups_36592 + smax32(1,
                                                                            sizze_31215) -
                                                  1, smax32(1, sizze_31215))) *
            squot32(arg_31616 + group_sizze_36582 * squot32(num_groups_36592 +
                                                            smax32(1,
                                                                   sizze_31215) -
                                                            1, smax32(1,
                                                                      sizze_31215)) -
                    1, group_sizze_36582 * squot32(num_groups_36592 + smax32(1,
                                                                             sizze_31215) -
                                                   1, smax32(1, sizze_31215)));
    int32_t remaining_elements_39271 = arg_31616 - starting_point_39270;
    
    if (sle32(remaining_elements_39271, 0) || sle32(arg_31616,
                                                    starting_point_39270)) {
        chunk_sizze_39269 = 0;
    } else {
        if (slt32(arg_31616, (srem32(global_tid_36423, group_sizze_36582 *
                                     squot32(num_groups_36592 + smax32(1,
                                                                       sizze_31215) -
                                             1, smax32(1, sizze_31215))) + 1) *
                  squot32(arg_31616 + group_sizze_36582 *
                          squot32(num_groups_36592 + smax32(1, sizze_31215) - 1,
                                  smax32(1, sizze_31215)) - 1,
                          group_sizze_36582 * squot32(num_groups_36592 +
                                                      smax32(1, sizze_31215) -
                                                      1, smax32(1,
                                                                sizze_31215))))) {
            chunk_sizze_39269 = arg_31616 - srem32(global_tid_36423,
                                                   group_sizze_36582 *
                                                   squot32(num_groups_36592 +
                                                           smax32(1,
                                                                  sizze_31215) -
                                                           1, smax32(1,
                                                                     sizze_31215))) *
                squot32(arg_31616 + group_sizze_36582 *
                        squot32(num_groups_36592 + smax32(1, sizze_31215) - 1,
                                smax32(1, sizze_31215)) - 1, group_sizze_36582 *
                        squot32(num_groups_36592 + smax32(1, sizze_31215) - 1,
                                smax32(1, sizze_31215)));
        } else {
            chunk_sizze_39269 = squot32(arg_31616 + group_sizze_36582 *
                                        squot32(num_groups_36592 + smax32(1,
                                                                          sizze_31215) -
                                                1, smax32(1, sizze_31215)) - 1,
                                        group_sizze_36582 *
                                        squot32(num_groups_36592 + smax32(1,
                                                                          sizze_31215) -
                                                1, smax32(1, sizze_31215)));
        }
    }
    
    bool x_36601;
    int32_t x_36602;
    float x_36603;
    bool x_36604;
    int32_t x_36605;
    float x_36606;
    
    x_36601 = 0;
    x_36602 = -1;
    x_36603 = 0.0F;
    for (int32_t i_39285 = 0; i_39285 < squot32(arg_31616 + group_sizze_36582 *
                                                squot32(num_groups_36592 +
                                                        smax32(1, sizze_31215) -
                                                        1, smax32(1,
                                                                  sizze_31215)) -
                                                1, group_sizze_36582 *
                                                squot32(num_groups_36592 +
                                                        smax32(1, sizze_31215) -
                                                        1, smax32(1,
                                                                  sizze_31215)));
         i_39285++) {
        gtid_36422 = local_tid_36424 + (squot32(srem32(global_tid_36423,
                                                       group_sizze_36582 *
                                                       squot32(num_groups_36592 +
                                                               smax32(1,
                                                                      sizze_31215) -
                                                               1, smax32(1,
                                                                         sizze_31215))),
                                                group_sizze_36582) *
                                        squot32(arg_31616 + group_sizze_36582 *
                                                squot32(num_groups_36592 +
                                                        smax32(1, sizze_31215) -
                                                        1, smax32(1,
                                                                  sizze_31215)) -
                                                1, group_sizze_36582 *
                                                squot32(num_groups_36592 +
                                                        smax32(1, sizze_31215) -
                                                        1, smax32(1,
                                                                  sizze_31215))) +
                                        i_39285) * group_sizze_36582;
        if (slt32(gtid_36422, arg_31616)) {
            // apply map function
            {
                int32_t y_36614;
                float y_36615;
                float x_36619;
                float x_36620;
                float res_36623;
                bool cond_36624;
                bool res_36625;
                bool res_36626;
                bool x_36627;
                float res_36628;
                bool res_36629;
                bool x_36630;
                float res_36631;
                
                y_36614 = *(__global int32_t *) &mem_38437[gtid_36399 * 4];
                y_36615 = *(__global float *) &mem_38434[gtid_36399 * 4];
                x_36619 = *(__global float *) &mem_38441[(gtid_36399 *
                                                          arg_31616 +
                                                          gtid_36422) * 4];
                x_36620 = *(__global float *) &mem_38361[gtid_36422 * 4];
                res_36623 = x_36619 / y_36615;
                cond_36624 = slt32(gtid_36422, y_36614);
                res_36625 = futrts_isnan32(res_36623);
                res_36626 = !res_36625;
                x_36627 = cond_36624 && res_36626;
                res_36628 = (float) fabs(res_36623);
                res_36629 = x_36620 < res_36628;
                x_36630 = x_36627 && res_36629;
                if (cond_36624) {
                    res_36631 = res_36623;
                } else {
                    res_36631 = 0.0F;
                }
                // save results to be reduced
                {
                    x_36604 = x_36630;
                    x_36605 = gtid_36422;
                    x_36606 = res_36631;
                }
                // save map-out results
                {
                    if (1) {
                        *(__global float *) &mem_38453[(gtid_36399 * arg_31616 +
                                                        gtid_36422) * 4] =
                            res_36623;
                    }
                }
                // apply reduction operator
                {
                    bool res_36607;
                    int32_t res_36608;
                    float res_36613;
                    
                    if (x_36601) {
                        res_36607 = x_36601;
                        res_36608 = x_36602;
                    } else {
                        bool x_36609;
                        bool y_36610;
                        bool res_36611;
                        int32_t res_36612;
                        
                        x_36609 = !x_36604;
                        y_36610 = x_36601 && x_36609;
                        res_36611 = x_36604 || y_36610;
                        if (x_36604) {
                            res_36612 = x_36605;
                        } else {
                            res_36612 = x_36602;
                        }
                        res_36607 = res_36611;
                        res_36608 = res_36612;
                    }
                    res_36613 = x_36603 + x_36606;
                    x_36601 = res_36607;
                    x_36602 = res_36608;
                    x_36603 = res_36613;
                }
            }
        }
        // to reduce current chunk, first store our result to memory
        {
            *(__local bool *) &red_arr_mem_39261[local_tid_36424] = x_36601;
            *(__local int32_t *) &red_arr_mem_39263[local_tid_36424 * 4] =
                x_36602;
            *(__local float *) &red_arr_mem_39265[local_tid_36424 * 4] =
                x_36603;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        
        int32_t offset_39286;
        int32_t skip_waves_39287;
        bool x_39272;
        int32_t x_39273;
        float x_39274;
        bool x_39275;
        int32_t x_39276;
        float x_39277;
        
        offset_39286 = 0;
        // participating threads read initial accumulator
        {
            if (slt32(local_tid_36424, group_sizze_36582)) {
                x_39272 = *(__local bool *) &red_arr_mem_39261[local_tid_36424 +
                                                               offset_39286];
                x_39273 = *(__local
                            int32_t *) &red_arr_mem_39263[(local_tid_36424 +
                                                           offset_39286) * 4];
                x_39274 = *(__local
                            float *) &red_arr_mem_39265[(local_tid_36424 +
                                                         offset_39286) * 4];
            }
        }
        offset_39286 = 1;
        while (slt32(offset_39286, wave_sizze_39248)) {
            if (slt32(local_tid_36424 + offset_39286, group_sizze_36582) &&
                ((local_tid_36424 - squot32(local_tid_36424, wave_sizze_39248) *
                  wave_sizze_39248) & (2 * offset_39286 - 1)) == 0) {
                // read array element
                {
                    x_39275 = *(volatile __local
                                bool *) &red_arr_mem_39261[local_tid_36424 +
                                                           offset_39286];
                    x_39276 = *(volatile __local
                                int32_t *) &red_arr_mem_39263[(local_tid_36424 +
                                                               offset_39286) *
                                                              4];
                    x_39277 = *(volatile __local
                                float *) &red_arr_mem_39265[(local_tid_36424 +
                                                             offset_39286) * 4];
                }
                // apply reduction operation
                {
                    bool res_39278;
                    int32_t res_39279;
                    float res_39284;
                    
                    if (x_39272) {
                        res_39278 = x_39272;
                        res_39279 = x_39273;
                    } else {
                        bool x_39280;
                        bool y_39281;
                        bool res_39282;
                        int32_t res_39283;
                        
                        x_39280 = !x_39275;
                        y_39281 = x_39272 && x_39280;
                        res_39282 = x_39275 || y_39281;
                        if (x_39275) {
                            res_39283 = x_39276;
                        } else {
                            res_39283 = x_39273;
                        }
                        res_39278 = res_39282;
                        res_39279 = res_39283;
                    }
                    res_39284 = x_39274 + x_39277;
                    x_39272 = res_39278;
                    x_39273 = res_39279;
                    x_39274 = res_39284;
                }
                // write result of operation
                {
                    *(volatile __local
                      bool *) &red_arr_mem_39261[local_tid_36424] = x_39272;
                    *(volatile __local
                      int32_t *) &red_arr_mem_39263[local_tid_36424 * 4] =
                        x_39273;
                    *(volatile __local
                      float *) &red_arr_mem_39265[local_tid_36424 * 4] =
                        x_39274;
                }
            }
            offset_39286 *= 2;
        }
        skip_waves_39287 = 1;
        while (slt32(skip_waves_39287, squot32(group_sizze_36582 +
                                               wave_sizze_39248 - 1,
                                               wave_sizze_39248))) {
            barrier(CLK_LOCAL_MEM_FENCE);
            offset_39286 = skip_waves_39287 * wave_sizze_39248;
            if (slt32(local_tid_36424 + offset_39286, group_sizze_36582) &&
                ((local_tid_36424 - squot32(local_tid_36424, wave_sizze_39248) *
                  wave_sizze_39248) == 0 && (squot32(local_tid_36424,
                                                     wave_sizze_39248) & (2 *
                                                                          skip_waves_39287 -
                                                                          1)) ==
                 0)) {
                // read array element
                {
                    x_39275 = *(__local
                                bool *) &red_arr_mem_39261[local_tid_36424 +
                                                           offset_39286];
                    x_39276 = *(__local
                                int32_t *) &red_arr_mem_39263[(local_tid_36424 +
                                                               offset_39286) *
                                                              4];
                    x_39277 = *(__local
                                float *) &red_arr_mem_39265[(local_tid_36424 +
                                                             offset_39286) * 4];
                }
                // apply reduction operation
                {
                    bool res_39278;
                    int32_t res_39279;
                    float res_39284;
                    
                    if (x_39272) {
                        res_39278 = x_39272;
                        res_39279 = x_39273;
                    } else {
                        bool x_39280;
                        bool y_39281;
                        bool res_39282;
                        int32_t res_39283;
                        
                        x_39280 = !x_39275;
                        y_39281 = x_39272 && x_39280;
                        res_39282 = x_39275 || y_39281;
                        if (x_39275) {
                            res_39283 = x_39276;
                        } else {
                            res_39283 = x_39273;
                        }
                        res_39278 = res_39282;
                        res_39279 = res_39283;
                    }
                    res_39284 = x_39274 + x_39277;
                    x_39272 = res_39278;
                    x_39273 = res_39279;
                    x_39274 = res_39284;
                }
                // write result of operation
                {
                    *(__local bool *) &red_arr_mem_39261[local_tid_36424] =
                        x_39272;
                    *(__local int32_t *) &red_arr_mem_39263[local_tid_36424 *
                                                            4] = x_39273;
                    *(__local float *) &red_arr_mem_39265[local_tid_36424 * 4] =
                        x_39274;
                }
            }
            skip_waves_39287 *= 2;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // first thread takes carry-out; others neutral element
        {
            if (local_tid_36424 == 0) {
                x_36601 = x_39272;
                x_36602 = x_39273;
                x_36603 = x_39274;
            } else {
                x_36601 = 0;
                x_36602 = -1;
                x_36603 = 0.0F;
            }
        }
    }
    if (squot32(num_groups_36592 + smax32(1, sizze_31215) - 1, smax32(1,
                                                                      sizze_31215)) ==
        1) {
        // first thread in group saves final result to memory
        {
            if (local_tid_36424 == 0) {
                *(__global bool *) &mem_38443[gtid_36399] = x_36601;
                *(__global int32_t *) &mem_38446[gtid_36399 * 4] = x_36602;
                *(__global float *) &mem_38449[gtid_36399 * 4] = x_36603;
            }
        }
    } else {
        int32_t old_counter_39288;
        
        // first thread in group saves group result to memory
        {
            if (local_tid_36424 == 0) {
                *(__global bool *) &group_res_arr_mem_39253[group_id_36425] =
                    x_36601;
                *(__global int32_t *) &group_res_arr_mem_39255[group_id_36425 *
                                                               4] = x_36602;
                *(__global float *) &group_res_arr_mem_39257[group_id_36425 *
                                                             4] = x_36603;
                mem_fence_global();
                old_counter_39288 = atomic_add((volatile __global
                                                int *) &counter_mem_39259[srem32(squot32(group_id_36425,
                                                                                         squot32(num_groups_36592 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215))),
                                                                                 1024) *
                                                                          4],
                                               1);
                *(__local bool *) &sync_arr_mem_39267[0] = old_counter_39288 ==
                    squot32(num_groups_36592 + smax32(1, sizze_31215) - 1,
                            smax32(1, sizze_31215)) - 1;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        
        bool is_last_group_39289 = *(__local bool *) &sync_arr_mem_39267[0];
        
        if (is_last_group_39289) {
            if (local_tid_36424 == 0) {
                old_counter_39288 = atomic_add((volatile __global
                                                int *) &counter_mem_39259[srem32(squot32(group_id_36425,
                                                                                         squot32(num_groups_36592 +
                                                                                                 smax32(1,
                                                                                                        sizze_31215) -
                                                                                                 1,
                                                                                                 smax32(1,
                                                                                                        sizze_31215))),
                                                                                 1024) *
                                                                          4],
                                               0 - squot32(num_groups_36592 +
                                                           smax32(1,
                                                                  sizze_31215) -
                                                           1, smax32(1,
                                                                     sizze_31215)));
            }
            // read in the per-group-results
            {
                if (slt32(local_tid_36424, squot32(num_groups_36592 + smax32(1,
                                                                             sizze_31215) -
                                                   1, smax32(1,
                                                             sizze_31215)))) {
                    x_36601 = *(__global
                                bool *) &group_res_arr_mem_39253[squot32(group_id_36425,
                                                                         squot32(num_groups_36592 +
                                                                                 smax32(1,
                                                                                        sizze_31215) -
                                                                                 1,
                                                                                 smax32(1,
                                                                                        sizze_31215))) *
                                                                 squot32(num_groups_36592 +
                                                                         smax32(1,
                                                                                sizze_31215) -
                                                                         1,
                                                                         smax32(1,
                                                                                sizze_31215)) +
                                                                 local_tid_36424];
                } else {
                    x_36601 = 0;
                }
                *(__local bool *) &red_arr_mem_39261[local_tid_36424] = x_36601;
                if (slt32(local_tid_36424, squot32(num_groups_36592 + smax32(1,
                                                                             sizze_31215) -
                                                   1, smax32(1,
                                                             sizze_31215)))) {
                    x_36602 = *(__global
                                int32_t *) &group_res_arr_mem_39255[(squot32(group_id_36425,
                                                                             squot32(num_groups_36592 +
                                                                                     smax32(1,
                                                                                            sizze_31215) -
                                                                                     1,
                                                                                     smax32(1,
                                                                                            sizze_31215))) *
                                                                     squot32(num_groups_36592 +
                                                                             smax32(1,
                                                                                    sizze_31215) -
                                                                             1,
                                                                             smax32(1,
                                                                                    sizze_31215)) +
                                                                     local_tid_36424) *
                                                                    4];
                } else {
                    x_36602 = -1;
                }
                *(__local int32_t *) &red_arr_mem_39263[local_tid_36424 * 4] =
                    x_36602;
                if (slt32(local_tid_36424, squot32(num_groups_36592 + smax32(1,
                                                                             sizze_31215) -
                                                   1, smax32(1,
                                                             sizze_31215)))) {
                    x_36603 = *(__global
                                float *) &group_res_arr_mem_39257[(squot32(group_id_36425,
                                                                           squot32(num_groups_36592 +
                                                                                   smax32(1,
                                                                                          sizze_31215) -
                                                                                   1,
                                                                                   smax32(1,
                                                                                          sizze_31215))) *
                                                                   squot32(num_groups_36592 +
                                                                           smax32(1,
                                                                                  sizze_31215) -
                                                                           1,
                                                                           smax32(1,
                                                                                  sizze_31215)) +
                                                                   local_tid_36424) *
                                                                  4];
                } else {
                    x_36603 = 0.0F;
                }
                *(__local float *) &red_arr_mem_39265[local_tid_36424 * 4] =
                    x_36603;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            // reduce the per-group results
            {
                int32_t offset_39290;
                int32_t skip_waves_39291;
                bool x_39272;
                int32_t x_39273;
                float x_39274;
                bool x_39275;
                int32_t x_39276;
                float x_39277;
                
                offset_39290 = 0;
                // participating threads read initial accumulator
                {
                    if (slt32(local_tid_36424, group_sizze_36582)) {
                        x_39272 = *(__local
                                    bool *) &red_arr_mem_39261[local_tid_36424 +
                                                               offset_39290];
                        x_39273 = *(__local
                                    int32_t *) &red_arr_mem_39263[(local_tid_36424 +
                                                                   offset_39290) *
                                                                  4];
                        x_39274 = *(__local
                                    float *) &red_arr_mem_39265[(local_tid_36424 +
                                                                 offset_39290) *
                                                                4];
                    }
                }
                offset_39290 = 1;
                while (slt32(offset_39290, wave_sizze_39248)) {
                    if (slt32(local_tid_36424 + offset_39290,
                              group_sizze_36582) && ((local_tid_36424 -
                                                      squot32(local_tid_36424,
                                                              wave_sizze_39248) *
                                                      wave_sizze_39248) & (2 *
                                                                           offset_39290 -
                                                                           1)) ==
                        0) {
                        // read array element
                        {
                            x_39275 = *(volatile __local
                                        bool *) &red_arr_mem_39261[local_tid_36424 +
                                                                   offset_39290];
                            x_39276 = *(volatile __local
                                        int32_t *) &red_arr_mem_39263[(local_tid_36424 +
                                                                       offset_39290) *
                                                                      4];
                            x_39277 = *(volatile __local
                                        float *) &red_arr_mem_39265[(local_tid_36424 +
                                                                     offset_39290) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            bool res_39278;
                            int32_t res_39279;
                            float res_39284;
                            
                            if (x_39272) {
                                res_39278 = x_39272;
                                res_39279 = x_39273;
                            } else {
                                bool x_39280;
                                bool y_39281;
                                bool res_39282;
                                int32_t res_39283;
                                
                                x_39280 = !x_39275;
                                y_39281 = x_39272 && x_39280;
                                res_39282 = x_39275 || y_39281;
                                if (x_39275) {
                                    res_39283 = x_39276;
                                } else {
                                    res_39283 = x_39273;
                                }
                                res_39278 = res_39282;
                                res_39279 = res_39283;
                            }
                            res_39284 = x_39274 + x_39277;
                            x_39272 = res_39278;
                            x_39273 = res_39279;
                            x_39274 = res_39284;
                        }
                        // write result of operation
                        {
                            *(volatile __local
                              bool *) &red_arr_mem_39261[local_tid_36424] =
                                x_39272;
                            *(volatile __local
                              int32_t *) &red_arr_mem_39263[local_tid_36424 *
                                                            4] = x_39273;
                            *(volatile __local
                              float *) &red_arr_mem_39265[local_tid_36424 * 4] =
                                x_39274;
                        }
                    }
                    offset_39290 *= 2;
                }
                skip_waves_39291 = 1;
                while (slt32(skip_waves_39291, squot32(group_sizze_36582 +
                                                       wave_sizze_39248 - 1,
                                                       wave_sizze_39248))) {
                    barrier(CLK_LOCAL_MEM_FENCE);
                    offset_39290 = skip_waves_39291 * wave_sizze_39248;
                    if (slt32(local_tid_36424 + offset_39290,
                              group_sizze_36582) && ((local_tid_36424 -
                                                      squot32(local_tid_36424,
                                                              wave_sizze_39248) *
                                                      wave_sizze_39248) == 0 &&
                                                     (squot32(local_tid_36424,
                                                              wave_sizze_39248) &
                                                      (2 * skip_waves_39291 -
                                                       1)) == 0)) {
                        // read array element
                        {
                            x_39275 = *(__local
                                        bool *) &red_arr_mem_39261[local_tid_36424 +
                                                                   offset_39290];
                            x_39276 = *(__local
                                        int32_t *) &red_arr_mem_39263[(local_tid_36424 +
                                                                       offset_39290) *
                                                                      4];
                            x_39277 = *(__local
                                        float *) &red_arr_mem_39265[(local_tid_36424 +
                                                                     offset_39290) *
                                                                    4];
                        }
                        // apply reduction operation
                        {
                            bool res_39278;
                            int32_t res_39279;
                            float res_39284;
                            
                            if (x_39272) {
                                res_39278 = x_39272;
                                res_39279 = x_39273;
                            } else {
                                bool x_39280;
                                bool y_39281;
                                bool res_39282;
                                int32_t res_39283;
                                
                                x_39280 = !x_39275;
                                y_39281 = x_39272 && x_39280;
                                res_39282 = x_39275 || y_39281;
                                if (x_39275) {
                                    res_39283 = x_39276;
                                } else {
                                    res_39283 = x_39273;
                                }
                                res_39278 = res_39282;
                                res_39279 = res_39283;
                            }
                            res_39284 = x_39274 + x_39277;
                            x_39272 = res_39278;
                            x_39273 = res_39279;
                            x_39274 = res_39284;
                        }
                        // write result of operation
                        {
                            *(__local
                              bool *) &red_arr_mem_39261[local_tid_36424] =
                                x_39272;
                            *(__local
                              int32_t *) &red_arr_mem_39263[local_tid_36424 *
                                                            4] = x_39273;
                            *(__local
                              float *) &red_arr_mem_39265[local_tid_36424 * 4] =
                                x_39274;
                        }
                    }
                    skip_waves_39291 *= 2;
                }
                // and back to memory with the final result
                {
                    if (local_tid_36424 == 0) {
                        *(__global bool *) &mem_38443[gtid_36399] = x_39272;
                        *(__global int32_t *) &mem_38446[gtid_36399 * 4] =
                            x_39273;
                        *(__global float *) &mem_38449[gtid_36399 * 4] =
                            x_39274;
                    }
                }
            }
        }
    }
}
// Non-segmented reduction kernel: folds the int32 values stored in
// res_mem_38339 with smax32, starting from an accumulator of 0, and writes a
// single int32 result to mem_38344[0].
//
// NOTE(review): smax32, slt32, squot32, mem_fence_global, LOCKSTEP_WIDTH,
// ALIGNED_LOCAL_MEMORY and mainzigroup_sizze_35577 are defined outside this
// kernel. They are presumably signed max, signed less-than, signed quotient,
// a global memory fence, the lockstep (wave) width, a local-array declaration
// macro and the compile-time work-group size -- confirm at their definitions.
//
// Stages, in order:
//   1. Each work-item sequentially folds its own strided chunk of the input.
//   2. Partial results are combined inside the work-group through local
//      memory: first within each wave (volatile accesses, no barriers), then
//      across waves (one barrier per step).
//   3. Work-item 0 of each group stores the group result in
//      group_res_arr_mem_39078 and atomically increments the counter held in
//      counter_mem_39076.
//   4. The group that observes the previous counter value
//      num_groups_35589 - 1 resets the counter, reduces all per-group results
//      in the same way and writes the final value.
//
// Parameters:
//   sizze_31215             - number of input elements; bounds the chunk each
//                             work-item processes.
//   num_groups_35589        - number of work-groups taking part; used for
//                             chunk sizing and for last-group detection.
//   res_mem_38339           - input buffer, read as int32 at byte offset
//                             index * 4.
//   mem_38344               - output buffer; the int32 result goes to offset 0.
//   counter_mem_39076       - global int counter shared by all groups; the
//                             last group subtracts num_groups_35589 from it.
//                             Presumably zero on entry -- TODO confirm against
//                             the host code.
//   group_res_arr_mem_39078 - scratch buffer with one int32 slot per group.
//   num_threads_39080       - stride between consecutive elements handled by
//                             one work-item; presumably the total number of
//                             work-items -- confirm against the launch code.
__kernel void segred_nonseg_35595(int32_t sizze_31215, int32_t num_groups_35589,
                                  __global unsigned char *res_mem_38339,
                                  __global unsigned char *mem_38344, __global
                                  unsigned char *counter_mem_39076, __global
                                  unsigned char *group_res_arr_mem_39078,
                                  int32_t num_threads_39080)
{
    // Group size used throughout this kernel; it comes from the constant
    // mainzigroup_sizze_35577 rather than from get_local_size.
    const int32_t group_sizze_35578 = mainzigroup_sizze_35577;
    // block_dim0..2 are not referenced again in this kernel body.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    // Local scratch space: 4 bytes per work-item for the reduction array, plus
    // 1 byte that later carries the is-last-group flag.
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39081_backing_0, 4 *
                         mainzigroup_sizze_35577);
    ALIGNED_LOCAL_MEMORY(sync_arr_mem_39083_backing_1, 1);
    
    int32_t global_tid_35595;
    int32_t local_tid_35596;
    int32_t group_sizze_39075;
    int32_t wave_sizze_39074;
    int32_t group_id_35597;
    
    global_tid_35595 = get_global_id(0);
    local_tid_35596 = get_local_id(0);
    // group_sizze_39075 is assigned here but never read in this kernel.
    group_sizze_39075 = get_local_size(0);
    wave_sizze_39074 = LOCKSTEP_WIDTH;
    group_id_35597 = get_group_id(0);
    
    // dummy_35575 is only assigned below and never read in this kernel.
    int32_t dummy_35575;
    int32_t gtid_35594;
    __local char *red_arr_mem_39081;
    
    red_arr_mem_39081 = (__local char *) red_arr_mem_39081_backing_0;
    
    __local char *sync_arr_mem_39083;
    
    sync_arr_mem_39083 = (__local char *) sync_arr_mem_39083_backing_1;
    dummy_35575 = 0;
    
    // Number of elements this work-item folds sequentially: the smaller of
    // (a) the input size divided by group size * number of groups and
    // (b) the count of strided indices, starting at this global id, that are
    // still below sizze_31215. Both use the add-divisor-minus-one form, i.e.
    // they round up assuming squot32 is truncating signed division.
    int32_t chunk_sizze_39085 = smin32(squot32(sizze_31215 + group_sizze_35578 *
                                               num_groups_35589 - 1,
                                               group_sizze_35578 *
                                               num_groups_35589),
                                       squot32(sizze_31215 - global_tid_35595 +
                                               num_threads_39080 - 1,
                                               num_threads_39080));
    // x_31595 is the running accumulator, x_31596 the current input element.
    int32_t x_31595;
    int32_t x_31596;
    
    // The accumulator starts at 0, so a work-item with an empty chunk
    // contributes 0 to the reduction.
    x_31595 = 0;
    for (int32_t i_39089 = 0; i_39089 < chunk_sizze_39085; i_39089++) {
        // Element index: start at the global id and advance by
        // num_threads_39080 on every iteration.
        gtid_35594 = global_tid_35595 + num_threads_39080 * i_39089;
        // apply map function
        {
            int32_t x_31598 = *(__global int32_t *) &res_mem_38339[gtid_35594 *
                                                                   4];
            
            // save results to be reduced
            {
                x_31596 = x_31598;
            }
            // save map-out results
            { }
            // apply reduction operator
            {
                int32_t res_31597 = smax32(x_31595, x_31596);
                
                x_31595 = res_31597;
            }
        }
    }
    // to reduce current chunk, first store our result to memory
    {
        *(__local int32_t *) &red_arr_mem_39081[local_tid_35596 * 4] = x_31595;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    // x_39086 is this work-item's accumulator for the in-group reduction;
    // x_39087 receives the partner value read from local memory.
    int32_t offset_39090;
    int32_t skip_waves_39091;
    int32_t x_39086;
    int32_t x_39087;
    
    offset_39090 = 0;
    // participating threads read initial accumulator
    {
        if (slt32(local_tid_35596, group_sizze_35578)) {
            x_39086 = *(__local int32_t *) &red_arr_mem_39081[(local_tid_35596 +
                                                               offset_39090) *
                                                              4];
        }
    }
    // Intra-wave tree reduction. offset doubles from 1 up to the wave size; at
    // each step a work-item whose lane index (local id minus the start of its
    // wave) has the low bits 2 * offset - 1 clear combines its accumulator
    // with the slot offset positions above it, provided that slot is inside
    // the group. No barrier is issued inside this loop; the local-memory
    // accesses are volatile instead.
    // NOTE(review): this presumably relies on the work-items of one wave
    // executing in lockstep -- confirm for the target device.
    offset_39090 = 1;
    while (slt32(offset_39090, wave_sizze_39074)) {
        if (slt32(local_tid_35596 + offset_39090, group_sizze_35578) &&
            ((local_tid_35596 - squot32(local_tid_35596, wave_sizze_39074) *
              wave_sizze_39074) & (2 * offset_39090 - 1)) == 0) {
            // read array element
            {
                x_39087 = *(volatile __local
                            int32_t *) &red_arr_mem_39081[(local_tid_35596 +
                                                           offset_39090) * 4];
            }
            // apply reduction operation
            {
                int32_t res_39088 = smax32(x_39086, x_39087);
                
                x_39086 = res_39088;
            }
            // write result of operation
            {
                *(volatile __local
                  int32_t *) &red_arr_mem_39081[local_tid_35596 * 4] = x_39086;
            }
        }
        offset_39090 *= 2;
    }
    // Cross-wave tree reduction. skip_waves doubles from 1 while it is below
    // the number of waves in the group (group size divided by wave size,
    // rounded up). Only lane 0 of a wave takes part, and only when its wave
    // index has the low bits 2 * skip_waves - 1 clear. Each step starts with a
    // barrier so the previous step's local-memory writes are visible.
    skip_waves_39091 = 1;
    while (slt32(skip_waves_39091, squot32(group_sizze_35578 +
                                           wave_sizze_39074 - 1,
                                           wave_sizze_39074))) {
        barrier(CLK_LOCAL_MEM_FENCE);
        offset_39090 = skip_waves_39091 * wave_sizze_39074;
        if (slt32(local_tid_35596 + offset_39090, group_sizze_35578) &&
            ((local_tid_35596 - squot32(local_tid_35596, wave_sizze_39074) *
              wave_sizze_39074) == 0 && (squot32(local_tid_35596,
                                                 wave_sizze_39074) & (2 *
                                                                      skip_waves_39091 -
                                                                      1)) ==
             0)) {
            // read array element
            {
                x_39087 = *(__local
                            int32_t *) &red_arr_mem_39081[(local_tid_35596 +
                                                           offset_39090) * 4];
            }
            // apply reduction operation
            {
                int32_t res_39088 = smax32(x_39086, x_39087);
                
                x_39086 = res_39088;
            }
            // write result of operation
            {
                *(__local int32_t *) &red_arr_mem_39081[local_tid_35596 * 4] =
                    x_39086;
            }
        }
        skip_waves_39091 *= 2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    int32_t old_counter_39092;
    
    // first thread in group saves group result to memory
    {
        if (local_tid_35596 == 0) {
            *(__global int32_t *) &group_res_arr_mem_39078[group_id_35597 * 4] =
                x_39086;
            // NOTE(review): presumably orders the global store above before
            // the counter increment below -- confirm at the definition of
            // mem_fence_global.
            mem_fence_global();
            // atomic_add yields the counter value from before the increment;
            // the group that sees num_groups_35589 - 1 is the last to arrive.
            old_counter_39092 = atomic_add((volatile __global
                                            int *) &counter_mem_39076[0], 1);
            // Publish the is-last-group flag to the rest of the work-group
            // through local memory.
            *(__local bool *) &sync_arr_mem_39083[0] = old_counter_39092 ==
                num_groups_35589 - 1;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    
    bool is_last_group_39093 = *(__local bool *) &sync_arr_mem_39083[0];
    
    // Only the last group to arrive performs the second-stage reduction over
    // the per-group results.
    if (is_last_group_39093) {
        if (local_tid_35596 == 0) {
            // Subtract num_groups_35589 from the counter, undoing the
            // increments made by all groups; presumably so the counter is back
            // at its initial value for a later launch -- confirm.
            old_counter_39092 = atomic_add((volatile __global
                                            int *) &counter_mem_39076[0], 0 -
                                           num_groups_35589);
        }
        // read in the per-group-results
        {
            // Work-items with a local id below num_groups_35589 load one group
            // result each; all others contribute 0, the same starting value
            // used for the accumulator in the first stage.
            // NOTE(review): group results at indices at or beyond the group
            // size are never loaded here; presumably the host guarantees
            // num_groups_35589 does not exceed the group size -- confirm.
            if (slt32(local_tid_35596, num_groups_35589)) {
                x_31595 = *(__global
                            int32_t *) &group_res_arr_mem_39078[local_tid_35596 *
                                                                4];
            } else {
                x_31595 = 0;
            }
            *(__local int32_t *) &red_arr_mem_39081[local_tid_35596 * 4] =
                x_31595;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // reduce the per-group results
        {
            // These declarations shadow the x_39086 / x_39087 declared earlier
            // in the kernel; the steps below repeat the same intra-wave and
            // cross-wave reduction on the per-group values.
            int32_t offset_39094;
            int32_t skip_waves_39095;
            int32_t x_39086;
            int32_t x_39087;
            
            offset_39094 = 0;
            // participating threads read initial accumulator
            {
                if (slt32(local_tid_35596, group_sizze_35578)) {
                    x_39086 = *(__local
                                int32_t *) &red_arr_mem_39081[(local_tid_35596 +
                                                               offset_39094) *
                                                              4];
                }
            }
            // Intra-wave step: volatile accesses, no barriers (same scheme as
            // the first in-group reduction in this kernel).
            offset_39094 = 1;
            while (slt32(offset_39094, wave_sizze_39074)) {
                if (slt32(local_tid_35596 + offset_39094, group_sizze_35578) &&
                    ((local_tid_35596 - squot32(local_tid_35596,
                                                wave_sizze_39074) *
                      wave_sizze_39074) & (2 * offset_39094 - 1)) == 0) {
                    // read array element
                    {
                        x_39087 = *(volatile __local
                                    int32_t *) &red_arr_mem_39081[(local_tid_35596 +
                                                                   offset_39094) *
                                                                  4];
                    }
                    // apply reduction operation
                    {
                        int32_t res_39088 = smax32(x_39086, x_39087);
                        
                        x_39086 = res_39088;
                    }
                    // write result of operation
                    {
                        *(volatile __local
                          int32_t *) &red_arr_mem_39081[local_tid_35596 * 4] =
                            x_39086;
                    }
                }
                offset_39094 *= 2;
            }
            // Cross-wave step: lane 0 of selected waves only, with a barrier
            // before every step.
            skip_waves_39095 = 1;
            while (slt32(skip_waves_39095, squot32(group_sizze_35578 +
                                                   wave_sizze_39074 - 1,
                                                   wave_sizze_39074))) {
                barrier(CLK_LOCAL_MEM_FENCE);
                offset_39094 = skip_waves_39095 * wave_sizze_39074;
                if (slt32(local_tid_35596 + offset_39094, group_sizze_35578) &&
                    ((local_tid_35596 - squot32(local_tid_35596,
                                                wave_sizze_39074) *
                      wave_sizze_39074) == 0 && (squot32(local_tid_35596,
                                                         wave_sizze_39074) &
                                                 (2 * skip_waves_39095 - 1)) ==
                     0)) {
                    // read array element
                    {
                        x_39087 = *(__local
                                    int32_t *) &red_arr_mem_39081[(local_tid_35596 +
                                                                   offset_39094) *
                                                                  4];
                    }
                    // apply reduction operation
                    {
                        int32_t res_39088 = smax32(x_39086, x_39087);
                        
                        x_39086 = res_39088;
                    }
                    // write result of operation
                    {
                        *(__local
                          int32_t *) &red_arr_mem_39081[local_tid_35596 * 4] =
                            x_39086;
                    }
                }
                skip_waves_39095 *= 2;
            }
            // and back to memory with the final result
            {
                // Work-item 0 holds the fully combined value and stores it as
                // the single int32 output.
                if (local_tid_35596 == 0) {
                    *(__global int32_t *) &mem_38344[0] = x_39086;
                }
            }
        }
    }
}
__kernel void segred_small_32822(int32_t sizze_31214, int32_t sizze_31215,
                                 int32_t sizze_31216, int32_t n_31219,
                                 int32_t res_31237, int32_t num_groups_33119,
                                 __global unsigned char *images_mem_37894,
                                 __global unsigned char *arg_mem_37903, __global
                                 unsigned char *mem_37995, __global
                                 unsigned char *mem_38000,
                                 int32_t segment_sizze_nonzzero_38654)
{
    const int32_t group_sizze_33109 = mainzigroup_sizze_32804;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    ALIGNED_LOCAL_MEMORY(red_arr_mem_38655_backing_0, 4 *
                         mainzigroup_sizze_32804);
    
    int32_t global_tid_32822;
    int32_t local_tid_32823;
    int32_t group_sizze_38653;
    int32_t wave_sizze_38652;
    int32_t group_id_32824;
    
    global_tid_32822 = get_global_id(0);
    local_tid_32823 = get_local_id(0);
    group_sizze_38653 = get_local_size(0);
    wave_sizze_38652 = LOCKSTEP_WIDTH;
    group_id_32824 = get_group_id(0);
    
    int32_t gtid_32791;
    int32_t gtid_32792;
    int32_t gtid_32793;
    int32_t gtid_32821;
    __local char *red_arr_mem_38655;
    
    red_arr_mem_38655 = (__local char *) red_arr_mem_38655_backing_0;
    for (int32_t i_38657 = 0; i_38657 < squot32(squot32(sizze_31215 *
                                                        res_31237 * res_31237 +
                                                        squot32(group_sizze_33109,
                                                                segment_sizze_nonzzero_38654) -
                                                        1,
                                                        squot32(group_sizze_33109,
                                                                segment_sizze_nonzzero_38654)) -
                                                group_id_32824 +
                                                num_groups_33119 - 1,
                                                num_groups_33119); i_38657++) {
        gtid_32791 = squot32(squot32(local_tid_32823,
                                     segment_sizze_nonzzero_38654) +
                             (group_id_32824 + i_38657 * num_groups_33119) *
                             squot32(group_sizze_33109,
                                     segment_sizze_nonzzero_38654), res_31237 *
                             res_31237);
        gtid_32792 = squot32(squot32(local_tid_32823,
                                     segment_sizze_nonzzero_38654) +
                             (group_id_32824 + i_38657 * num_groups_33119) *
                             squot32(group_sizze_33109,
                                     segment_sizze_nonzzero_38654) -
                             squot32(squot32(local_tid_32823,
                                             segment_sizze_nonzzero_38654) +
                                     (group_id_32824 + i_38657 *
                                      num_groups_33119) *
                                     squot32(group_sizze_33109,
                                             segment_sizze_nonzzero_38654),
                                     res_31237 * res_31237) * (res_31237 *
                                                               res_31237),
                             res_31237);
        gtid_32793 = squot32(local_tid_32823, segment_sizze_nonzzero_38654) +
            (group_id_32824 + i_38657 * num_groups_33119) *
            squot32(group_sizze_33109, segment_sizze_nonzzero_38654) -
            squot32(squot32(local_tid_32823, segment_sizze_nonzzero_38654) +
                    (group_id_32824 + i_38657 * num_groups_33119) *
                    squot32(group_sizze_33109, segment_sizze_nonzzero_38654),
                    res_31237 * res_31237) * (res_31237 * res_31237) -
            squot32(squot32(local_tid_32823, segment_sizze_nonzzero_38654) +
                    (group_id_32824 + i_38657 * num_groups_33119) *
                    squot32(group_sizze_33109, segment_sizze_nonzzero_38654) -
                    squot32(squot32(local_tid_32823,
                                    segment_sizze_nonzzero_38654) +
                            (group_id_32824 + i_38657 * num_groups_33119) *
                            squot32(group_sizze_33109,
                                    segment_sizze_nonzzero_38654), res_31237 *
                            res_31237) * (res_31237 * res_31237), res_31237) *
            res_31237;
        gtid_32821 = srem32(local_tid_32823, n_31219);
        // apply map function if in bounds
        {
            if (slt32(0, n_31219) && (((slt32(gtid_32791, sizze_31215) &&
                                        slt32(gtid_32792, res_31237)) &&
                                       slt32(gtid_32793, res_31237)) &&
                                      slt32(local_tid_32823, n_31219 *
                                            squot32(group_sizze_33109,
                                                    segment_sizze_nonzzero_38654)))) {
                float x_33131;
                float x_33132;
                float x_33133;
                float x_33134;
                bool res_33135;
                float y_33136;
                float res_33137;
                
                x_33131 = *(__global float *) &images_mem_37894[(gtid_32791 *
                                                                 sizze_31216 +
                                                                 gtid_32821) *
                                                                4];
                x_33132 = *(__global float *) &arg_mem_37903[(gtid_32792 *
                                                              sizze_31214 +
                                                              gtid_32821) * 4];
                x_33133 = *(__global float *) &mem_37995[(gtid_32793 *
                                                          sizze_31214 +
                                                          gtid_32821) * 4];
                x_33134 = x_33132 * x_33133;
                res_33135 = futrts_isnan32(x_33131);
                if (res_33135) {
                    y_33136 = 0.0F;
                } else {
                    y_33136 = 1.0F;
                }
                res_33137 = x_33134 * y_33136;
                // save results to be reduced
                {
                    *(__local float *) &red_arr_mem_38655[local_tid_32823 * 4] =
                        res_33137;
                }
                // save map-out results
                { }
            } else {
                *(__local float *) &red_arr_mem_38655[local_tid_32823 * 4] =
                    0.0F;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (slt32(0, n_31219)) {
            // perform segmented scan to imitate reduction
            {
                float x_33125;
                float x_33126;
                float x_38658;
                float x_38659;
                int32_t skip_threads_38661;
                
                if (slt32(local_tid_32823, n_31219 * squot32(group_sizze_33109,
                                                             segment_sizze_nonzzero_38654))) {
                    x_33126 = *(volatile __local
                                float *) &red_arr_mem_38655[local_tid_32823 *
                                                            sizeof(float)];
                }
                // in-block scan (hopefully no barriers needed)
                {
                    skip_threads_38661 = 1;
                    while (slt32(skip_threads_38661, 32)) {
                        if (sle32(skip_threads_38661, local_tid_32823 -
                                  squot32(local_tid_32823, 32) * 32) &&
                            slt32(local_tid_32823, n_31219 *
                                  squot32(group_sizze_33109,
                                          segment_sizze_nonzzero_38654))) {
                            // read operands
                            {
                                x_33125 = *(volatile __local
                                            float *) &red_arr_mem_38655[(local_tid_32823 -
                                                                         skip_threads_38661) *
                                                                        sizeof(float)];
                            }
                            // perform operation
                            {
                                if (!slt32(srem32(local_tid_32823, n_31219),
                                           local_tid_32823 - (local_tid_32823 -
                                                              skip_threads_38661))) {
                                    float res_33127 = x_33125 + x_33126;
                                    
                                    x_33126 = res_33127;
                                }
                            }
                        }
                        if (sle32(wave_sizze_38652, skip_threads_38661)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        if (sle32(skip_threads_38661, local_tid_32823 -
                                  squot32(local_tid_32823, 32) * 32) &&
                            slt32(local_tid_32823, n_31219 *
                                  squot32(group_sizze_33109,
                                          segment_sizze_nonzzero_38654))) {
                            // write result
                            {
                                *(volatile __local
                                  float *) &red_arr_mem_38655[local_tid_32823 *
                                                              sizeof(float)] =
                                    x_33126;
                            }
                        }
                        if (sle32(wave_sizze_38652, skip_threads_38661)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        skip_threads_38661 *= 2;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // last thread of block 'i' writes its result to offset 'i'
                {
                    if ((local_tid_32823 - squot32(local_tid_32823, 32) * 32) ==
                        31 && slt32(local_tid_32823, n_31219 *
                                    squot32(group_sizze_33109,
                                            segment_sizze_nonzzero_38654))) {
                        *(volatile __local
                          float *) &red_arr_mem_38655[squot32(local_tid_32823,
                                                              32) *
                                                      sizeof(float)] = x_33126;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
                {
                    int32_t skip_threads_38662;
                    
                    if (squot32(local_tid_32823, 32) == 0 &&
                        slt32(local_tid_32823, n_31219 *
                              squot32(group_sizze_33109,
                                      segment_sizze_nonzzero_38654))) {
                        x_38659 = *(volatile __local
                                    float *) &red_arr_mem_38655[local_tid_32823 *
                                                                sizeof(float)];
                    }
                    // in-block scan (hopefully no barriers needed)
                    {
                        skip_threads_38662 = 1;
                        while (slt32(skip_threads_38662, 32)) {
                            if (sle32(skip_threads_38662, local_tid_32823 -
                                      squot32(local_tid_32823, 32) * 32) &&
                                (squot32(local_tid_32823, 32) == 0 &&
                                 slt32(local_tid_32823, n_31219 *
                                       squot32(group_sizze_33109,
                                               segment_sizze_nonzzero_38654)))) {
                                // read operands
                                {
                                    x_38658 = *(volatile __local
                                                float *) &red_arr_mem_38655[(local_tid_32823 -
                                                                             skip_threads_38662) *
                                                                            sizeof(float)];
                                }
                                // perform operation
                                {
                                    if (!slt32(srem32(local_tid_32823 * 32 +
                                                      32 - 1, n_31219),
                                               local_tid_32823 * 32 + 32 - 1 -
                                               ((local_tid_32823 -
                                                 skip_threads_38662) * 32 + 32 -
                                                1))) {
                                        float res_38660 = x_38658 + x_38659;
                                        
                                        x_38659 = res_38660;
                                    }
                                }
                            }
                            if (sle32(wave_sizze_38652, skip_threads_38662)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            if (sle32(skip_threads_38662, local_tid_32823 -
                                      squot32(local_tid_32823, 32) * 32) &&
                                (squot32(local_tid_32823, 32) == 0 &&
                                 slt32(local_tid_32823, n_31219 *
                                       squot32(group_sizze_33109,
                                               segment_sizze_nonzzero_38654)))) {
                                // write result
                                {
                                    *(volatile __local
                                      float *) &red_arr_mem_38655[local_tid_32823 *
                                                                  sizeof(float)] =
                                        x_38659;
                                }
                            }
                            if (sle32(wave_sizze_38652, skip_threads_38662)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            skip_threads_38662 *= 2;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // carry-in for every block except the first
                {
                    if (!(squot32(local_tid_32823, 32) == 0 ||
                          !slt32(local_tid_32823, n_31219 *
                                 squot32(group_sizze_33109,
                                         segment_sizze_nonzzero_38654)))) {
                        // read operands
                        {
                            x_33125 = *(volatile __local
                                        float *) &red_arr_mem_38655[(squot32(local_tid_32823,
                                                                             32) -
                                                                     1) *
                                                                    sizeof(float)];
                        }
                        // perform operation
                        {
                            if (!slt32(srem32(local_tid_32823, n_31219),
                                       local_tid_32823 -
                                       (squot32(local_tid_32823, 32) * 32 -
                                        1))) {
                                float res_33127 = x_33125 + x_33126;
                                
                                x_33126 = res_33127;
                            }
                        }
                        // write final result
                        {
                            *(volatile __local
                              float *) &red_arr_mem_38655[local_tid_32823 *
                                                          sizeof(float)] =
                                x_33126;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // restore correct values for first block
                {
                    if (squot32(local_tid_32823, 32) == 0) {
                        *(volatile __local
                          float *) &red_arr_mem_38655[local_tid_32823 *
                                                      sizeof(float)] = x_33126;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // save final values of segments
        {
            if (slt32((group_id_32824 + i_38657 * num_groups_33119) *
                      squot32(group_sizze_33109, segment_sizze_nonzzero_38654) +
                      local_tid_32823, sizze_31215 * res_31237 * res_31237) &&
                slt32(local_tid_32823, squot32(group_sizze_33109,
                                               segment_sizze_nonzzero_38654))) {
                *(__global float *) &mem_38000[(squot32((group_id_32824 +
                                                         i_38657 *
                                                         num_groups_33119) *
                                                        squot32(group_sizze_33109,
                                                                segment_sizze_nonzzero_38654) +
                                                        local_tid_32823,
                                                        res_31237 * res_31237) *
                                                (res_31237 * res_31237) +
                                                squot32((group_id_32824 +
                                                         i_38657 *
                                                         num_groups_33119) *
                                                        squot32(group_sizze_33109,
                                                                segment_sizze_nonzzero_38654) +
                                                        local_tid_32823 -
                                                        squot32((group_id_32824 +
                                                                 i_38657 *
                                                                 num_groups_33119) *
                                                                squot32(group_sizze_33109,
                                                                        segment_sizze_nonzzero_38654) +
                                                                local_tid_32823,
                                                                res_31237 *
                                                                res_31237) *
                                                        (res_31237 * res_31237),
                                                        res_31237) * res_31237 +
                                                ((group_id_32824 + i_38657 *
                                                  num_groups_33119) *
                                                 squot32(group_sizze_33109,
                                                         segment_sizze_nonzzero_38654) +
                                                 local_tid_32823 -
                                                 squot32((group_id_32824 +
                                                          i_38657 *
                                                          num_groups_33119) *
                                                         squot32(group_sizze_33109,
                                                                 segment_sizze_nonzzero_38654) +
                                                         local_tid_32823,
                                                         res_31237 *
                                                         res_31237) *
                                                 (res_31237 * res_31237) -
                                                 squot32((group_id_32824 +
                                                          i_38657 *
                                                          num_groups_33119) *
                                                         squot32(group_sizze_33109,
                                                                 segment_sizze_nonzzero_38654) +
                                                         local_tid_32823 -
                                                         squot32((group_id_32824 +
                                                                  i_38657 *
                                                                  num_groups_33119) *
                                                                 squot32(group_sizze_33109,
                                                                         segment_sizze_nonzzero_38654) +
                                                                 local_tid_32823,
                                                                 res_31237 *
                                                                 res_31237) *
                                                         (res_31237 *
                                                          res_31237),
                                                         res_31237) *
                                                 res_31237)) * 4] = *(__local
                                                                      float *) &red_arr_mem_38655[((local_tid_32823 +
                                                                                                    1) *
                                                                                                   segment_sizze_nonzzero_38654 -
                                                                                                   1) *
                                                                                                  4];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
// segred_small_34038: segmented float sum for small segments, where one
// work-group processes one or more whole segments per loop iteration.
//
// For every pair (gtid_34011 < sizze_31215, gtid_34012 < res_31237) the kernel
// adds up, for j in [0, n_31219), the products
//   arg_mem_37903[gtid_34012 * sizze_31214 + j] *
//   images_mem_37894[gtid_34011 * sizze_31216 + j]
// (both buffers are read as float), where a term is replaced by 0.0F when
// futrts_isnan32 reports true for the images value.  The result is stored as
// a float at element gtid_34011 * res_31237 + gtid_34012 of mem_38082.
// When n_31219 is not positive the scratch array is only ever filled with
// 0.0F, so 0.0F is what gets stored.
//
// Parameters:
//   sizze_31214   row stride (in floats) used when indexing arg_mem_37903
//   sizze_31215   extent of the outer segment index gtid_34011
//   sizze_31216   row stride (in floats) used when indexing images_mem_37894
//   n_31219       number of elements reduced per segment
//   res_31237     extent of the inner segment index gtid_34012
//   num_groups_34145  stride between the virtual groups handled by one
//                 work-group (presumably the number of launched groups)
//   images_mem_37894, arg_mem_37903  input byte buffers, read as float
//   mem_38082     output byte buffer, written as float
//   segment_sizze_nonzzero_38739  divisor used to compute how many segments
//                 fit in a group; presumably n_31219 clamped to be nonzero by
//                 the host -- TODO confirm against the launch code
//
// squot32 / srem32 / slt32 / sle32 are helpers defined elsewhere in the file;
// the comments below assume they are signed 32-bit quotient, remainder,
// less-than and less-or-equal respectively.
__kernel void segred_small_34038(int32_t sizze_31214, int32_t sizze_31215,
                                 int32_t sizze_31216, int32_t n_31219,
                                 int32_t res_31237, int32_t num_groups_34145,
                                 __global unsigned char *images_mem_37894,
                                 __global unsigned char *arg_mem_37903, __global
                                 unsigned char *mem_38082,
                                 int32_t segment_sizze_nonzzero_38739)
{
    const int32_t group_sizze_34135 = mainzigroup_sizze_34020;
    // block_dim0..block_dim2 are not referenced again in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    // Backing store for the reduction scratch array: 4 bytes (one float) per
    // work-item of a mainzigroup_sizze_34020-sized group.  ALIGNED_LOCAL_MEMORY
    // is a macro defined elsewhere; its result is used below as __local memory.
    ALIGNED_LOCAL_MEMORY(red_arr_mem_38740_backing_0, 4 *
                         mainzigroup_sizze_34020);
    
    int32_t global_tid_34038;
    int32_t local_tid_34039;
    int32_t group_sizze_38738;
    int32_t wave_sizze_38737;
    int32_t group_id_34040;
    
    // global_tid_34038 and group_sizze_38738 are assigned here but not read
    // again in this kernel; the loop below uses group_sizze_34135 instead.
    global_tid_34038 = get_global_id(0);
    local_tid_34039 = get_local_id(0);
    group_sizze_38738 = get_local_size(0);
    wave_sizze_38737 = LOCKSTEP_WIDTH;
    group_id_34040 = get_group_id(0);
    
    int32_t gtid_34011;
    int32_t gtid_34012;
    int32_t gtid_34037;
    __local char *red_arr_mem_38740;
    
    red_arr_mem_38740 = (__local char *) red_arr_mem_38740_backing_0;
    // Each iteration handles the virtual group with index
    // group_id_34040 + i_38742 * num_groups_34145.  A virtual group covers
    // squot32(group_sizze_34135, segment_sizze_nonzzero_38739) consecutive
    // segments.  The bound is: round-up division of the
    // sizze_31215 * res_31237 segments by segments-per-group (number of
    // virtual groups), then round-up division of (that minus group_id) by
    // num_groups_34145.
    for (int32_t i_38742 = 0; i_38742 < squot32(squot32(sizze_31215 *
                                                        res_31237 +
                                                        squot32(group_sizze_34135,
                                                                segment_sizze_nonzzero_38739) -
                                                        1,
                                                        squot32(group_sizze_34135,
                                                                segment_sizze_nonzzero_38739)) -
                                                group_id_34040 +
                                                num_groups_34145 - 1,
                                                num_groups_34145); i_38742++) {
        // Flat segment index of this thread =
        //   local_tid / segment size + virtual group * segments per group.
        // gtid_34011 is that index divided by res_31237 ...
        gtid_34011 = squot32(squot32(local_tid_34039,
                                     segment_sizze_nonzzero_38739) +
                             (group_id_34040 + i_38742 * num_groups_34145) *
                             squot32(group_sizze_34135,
                                     segment_sizze_nonzzero_38739), res_31237);
        // ... and gtid_34012 is what is left after removing
        // gtid_34011 * res_31237 from the flat index.
        gtid_34012 = squot32(local_tid_34039, segment_sizze_nonzzero_38739) +
            (group_id_34040 + i_38742 * num_groups_34145) *
            squot32(group_sizze_34135, segment_sizze_nonzzero_38739) -
            squot32(squot32(local_tid_34039, segment_sizze_nonzzero_38739) +
                    (group_id_34040 + i_38742 * num_groups_34145) *
                    squot32(group_sizze_34135, segment_sizze_nonzzero_38739),
                    res_31237) * res_31237;
        // Position of this thread's element inside its segment.
        // NOTE(review): srem32 is applied to n_31219 itself rather than to
        // segment_sizze_nonzzero_38739, and it runs before the 0 < n_31219
        // test; what happens for n_31219 == 0 depends on srem32's definition,
        // which is outside this view.
        gtid_34037 = srem32(local_tid_34039, n_31219);
        // apply map function if in bounds
        {
            // In bounds means: segments are non-empty, both segment indices
            // are inside their extents, and this thread falls within the
            // n_31219 * segments-per-group elements the group actually covers.
            if (slt32(0, n_31219) && ((slt32(gtid_34011, sizze_31215) &&
                                       slt32(gtid_34012, res_31237)) &&
                                      slt32(local_tid_34039, n_31219 *
                                            squot32(group_sizze_34135,
                                                    segment_sizze_nonzzero_38739)))) {
                float x_34156;
                float x_34157;
                bool res_34158;
                float res_34159;
                
                // Byte offsets: element index times 4 (one float).
                x_34156 = *(__global float *) &arg_mem_37903[(gtid_34012 *
                                                              sizze_31214 +
                                                              gtid_34037) * 4];
                x_34157 = *(__global float *) &images_mem_37894[(gtid_34011 *
                                                                 sizze_31216 +
                                                                 gtid_34037) *
                                                                4];
                // Only the images value is tested; when the test is true the
                // term contributes 0.0F instead of the product.
                res_34158 = futrts_isnan32(x_34157);
                if (res_34158) {
                    res_34159 = 0.0F;
                } else {
                    float res_34160 = x_34156 * x_34157;
                    
                    res_34159 = res_34160;
                }
                // save results to be reduced
                {
                    *(__local float *) &red_arr_mem_38740[local_tid_34039 * 4] =
                        res_34159;
                }
                // save map-out results
                { }
            } else {
                // Out-of-bounds threads store 0.0F in their scratch slot.
                *(__local float *) &red_arr_mem_38740[local_tid_34039 * 4] =
                    0.0F;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // The scan is skipped entirely for empty segments; the scratch array
        // then still holds the 0.0F values stored above.
        if (slt32(0, n_31219)) {
            // perform segmented scan to imitate reduction
            {
                // x_34151 / x_34152: left operand and running value of the
                // per-element scan.  x_38743 / x_38744: the same roles for the
                // scan over per-block totals.
                float x_34151;
                float x_34152;
                float x_38743;
                float x_38744;
                int32_t skip_threads_38746;
                
                // Active threads load their own mapped value as the initial
                // running value.
                if (slt32(local_tid_34039, n_31219 * squot32(group_sizze_34135,
                                                             segment_sizze_nonzzero_38739))) {
                    x_34152 = *(volatile __local
                                float *) &red_arr_mem_38740[local_tid_34039 *
                                                            sizeof(float)];
                }
                // in-block scan (hopefully no barriers needed)
                {
                    // Blocks are hard-coded to 32 consecutive threads; the
                    // stride doubles each round (1, 2, 4, 8, 16).
                    skip_threads_38746 = 1;
                    while (slt32(skip_threads_38746, 32)) {
                        // Participate only if the lane index within the block
                        // (local_tid - block * 32) is at least the stride and
                        // the thread is active.
                        if (sle32(skip_threads_38746, local_tid_34039 -
                                  squot32(local_tid_34039, 32) * 32) &&
                            slt32(local_tid_34039, n_31219 *
                                  squot32(group_sizze_34135,
                                          segment_sizze_nonzzero_38739))) {
                            // read operands
                            {
                                x_34151 = *(volatile __local
                                            float *) &red_arr_mem_38740[(local_tid_34039 -
                                                                         skip_threads_38746) *
                                                                        sizeof(float)];
                            }
                            // perform operation
                            {
                                // Segment test: the second argument reduces to
                                // skip_threads, so the add happens only when
                                // the position within the segment is at least
                                // the stride, i.e. the operand belongs to the
                                // same segment.
                                if (!slt32(srem32(local_tid_34039, n_31219),
                                           local_tid_34039 - (local_tid_34039 -
                                                              skip_threads_38746))) {
                                    float res_34153 = x_34151 + x_34152;
                                    
                                    x_34152 = res_34153;
                                }
                            }
                        }
                        // Barrier only once the stride reaches the lockstep
                        // width; for smaller strides the code presumably
                        // relies on lanes executing in lockstep.  The
                        // condition has the same value in every work-item, so
                        // all of them take the barrier together.
                        if (sle32(wave_sizze_38737, skip_threads_38746)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        if (sle32(skip_threads_38746, local_tid_34039 -
                                  squot32(local_tid_34039, 32) * 32) &&
                            slt32(local_tid_34039, n_31219 *
                                  squot32(group_sizze_34135,
                                          segment_sizze_nonzzero_38739))) {
                            // write result
                            {
                                *(volatile __local
                                  float *) &red_arr_mem_38740[local_tid_34039 *
                                                              sizeof(float)] =
                                    x_34152;
                            }
                        }
                        if (sle32(wave_sizze_38737, skip_threads_38746)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        skip_threads_38746 *= 2;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // last thread of block 'i' writes its result to offset 'i'
                {
                    // This overwrites the first slots of the scratch array;
                    // block 0 puts its own values back in the restore step
                    // further down.
                    if ((local_tid_34039 - squot32(local_tid_34039, 32) * 32) ==
                        31 && slt32(local_tid_34039, n_31219 *
                                    squot32(group_sizze_34135,
                                            segment_sizze_nonzzero_38739))) {
                        *(volatile __local
                          float *) &red_arr_mem_38740[squot32(local_tid_34039,
                                                              32) *
                                                      sizeof(float)] = x_34152;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
                {
                    int32_t skip_threads_38747;
                    
                    // Only active threads of block 0 take part; slot t now
                    // stands for the last element of block t.
                    if (squot32(local_tid_34039, 32) == 0 &&
                        slt32(local_tid_34039, n_31219 *
                              squot32(group_sizze_34135,
                                      segment_sizze_nonzzero_38739))) {
                        x_38744 = *(volatile __local
                                    float *) &red_arr_mem_38740[local_tid_34039 *
                                                                sizeof(float)];
                    }
                    // in-block scan (hopefully no barriers needed)
                    {
                        skip_threads_38747 = 1;
                        while (slt32(skip_threads_38747, 32)) {
                            if (sle32(skip_threads_38747, local_tid_34039 -
                                      squot32(local_tid_34039, 32) * 32) &&
                                (squot32(local_tid_34039, 32) == 0 &&
                                 slt32(local_tid_34039, n_31219 *
                                       squot32(group_sizze_34135,
                                               segment_sizze_nonzzero_38739)))) {
                                // read operands
                                {
                                    x_38743 = *(volatile __local
                                                float *) &red_arr_mem_38740[(local_tid_34039 -
                                                                             skip_threads_38747) *
                                                                            sizeof(float)];
                                }
                                // perform operation
                                {
                                    // Same segment test as above, but carried
                                    // out on the original element indices
                                    // t * 32 + 31 that slots t and
                                    // t - skip_threads stand for.
                                    if (!slt32(srem32(local_tid_34039 * 32 +
                                                      32 - 1, n_31219),
                                               local_tid_34039 * 32 + 32 - 1 -
                                               ((local_tid_34039 -
                                                 skip_threads_38747) * 32 + 32 -
                                                1))) {
                                        float res_38745 = x_38743 + x_38744;
                                        
                                        x_38744 = res_38745;
                                    }
                                }
                            }
                            if (sle32(wave_sizze_38737, skip_threads_38747)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            if (sle32(skip_threads_38747, local_tid_34039 -
                                      squot32(local_tid_34039, 32) * 32) &&
                                (squot32(local_tid_34039, 32) == 0 &&
                                 slt32(local_tid_34039, n_31219 *
                                       squot32(group_sizze_34135,
                                               segment_sizze_nonzzero_38739)))) {
                                // write result
                                {
                                    *(volatile __local
                                      float *) &red_arr_mem_38740[local_tid_34039 *
                                                                  sizeof(float)] =
                                        x_38744;
                                }
                            }
                            if (sle32(wave_sizze_38737, skip_threads_38747)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            skip_threads_38747 *= 2;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // carry-in for every block except the first
                {
                    // Equivalent to: block index is not 0 AND thread is active.
                    if (!(squot32(local_tid_34039, 32) == 0 ||
                          !slt32(local_tid_34039, n_31219 *
                                 squot32(group_sizze_34135,
                                         segment_sizze_nonzzero_38739)))) {
                        // read operands
                        {
                            // Scanned total of the preceding block.
                            x_34151 = *(volatile __local
                                        float *) &red_arr_mem_38740[(squot32(local_tid_34039,
                                                                             32) -
                                                                     1) *
                                                                    sizeof(float)];
                        }
                        // perform operation
                        {
                            // block * 32 - 1 is the last element of the
                            // preceding block; add the carry only if that
                            // element lies in this thread's segment.
                            if (!slt32(srem32(local_tid_34039, n_31219),
                                       local_tid_34039 -
                                       (squot32(local_tid_34039, 32) * 32 -
                                        1))) {
                                float res_34153 = x_34151 + x_34152;
                                
                                x_34152 = res_34153;
                            }
                        }
                        // write final result
                        {
                            *(volatile __local
                              float *) &red_arr_mem_38740[local_tid_34039 *
                                                          sizeof(float)] =
                                x_34152;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // restore correct values for first block
                {
                    // Block 0 slots were overwritten by the block totals and
                    // the carry scan, so block 0 writes its scan values back.
                    // NOTE(review): x_34152 is assigned only for threads with
                    // local_tid < n_31219 * segments-per-group, but this store
                    // has no such guard, so other block-0 threads store an
                    // unassigned value.  Those slots look unread by the final
                    // save below provided segment_sizze_nonzzero_38739 equals
                    // n_31219 here -- confirm.
                    if (squot32(local_tid_34039, 32) == 0) {
                        *(volatile __local
                          float *) &red_arr_mem_38740[local_tid_34039 *
                                                      sizeof(float)] = x_34152;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // save final values of segments
        {
            // Thread t (t < segments per group) stores segment t of this
            // virtual group, provided its flat segment index is below
            // sizze_31215 * res_31237.  The destination index splits the flat
            // index into quotient and remainder by res_31237 and recombines
            // them, so it equals the flat index; the value read is the last
            // scan element of segment t, at (t + 1) * segment size - 1.
            if (slt32((group_id_34040 + i_38742 * num_groups_34145) *
                      squot32(group_sizze_34135, segment_sizze_nonzzero_38739) +
                      local_tid_34039, sizze_31215 * res_31237) &&
                slt32(local_tid_34039, squot32(group_sizze_34135,
                                               segment_sizze_nonzzero_38739))) {
                *(__global float *) &mem_38082[(squot32((group_id_34040 +
                                                         i_38742 *
                                                         num_groups_34145) *
                                                        squot32(group_sizze_34135,
                                                                segment_sizze_nonzzero_38739) +
                                                        local_tid_34039,
                                                        res_31237) * res_31237 +
                                                ((group_id_34040 + i_38742 *
                                                  num_groups_34145) *
                                                 squot32(group_sizze_34135,
                                                         segment_sizze_nonzzero_38739) +
                                                 local_tid_34039 -
                                                 squot32((group_id_34040 +
                                                          i_38742 *
                                                          num_groups_34145) *
                                                         squot32(group_sizze_34135,
                                                                 segment_sizze_nonzzero_38739) +
                                                         local_tid_34039,
                                                         res_31237) *
                                                 res_31237)) * 4] = *(__local
                                                                      float *) &red_arr_mem_38740[((local_tid_34039 +
                                                                                                    1) *
                                                                                                   segment_sizze_nonzzero_38739 -
                                                                                                   1) *
                                                                                                  4];
            }
        }
        // Keep the scratch array stable until every thread has read its
        // result, before the next iteration overwrites it.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
__kernel void segred_small_34370(int32_t sizze_31215, int32_t res_31237,
                                 int32_t j_m_i_31370, int32_t num_groups_34471,
                                 __global unsigned char *res_mem_38037, __global
                                 unsigned char *res_mem_38086, __global
                                 unsigned char *mem_38138,
                                 int32_t segment_sizze_nonzzero_38800)
{
    const int32_t group_sizze_34461 = mainzigroup_sizze_34352;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    ALIGNED_LOCAL_MEMORY(red_arr_mem_38801_backing_0, 4 *
                         mainzigroup_sizze_34352);
    
    int32_t global_tid_34370;
    int32_t local_tid_34371;
    int32_t group_sizze_38799;
    int32_t wave_sizze_38798;
    int32_t group_id_34372;
    
    global_tid_34370 = get_global_id(0);
    local_tid_34371 = get_local_id(0);
    group_sizze_38799 = get_local_size(0);
    wave_sizze_38798 = LOCKSTEP_WIDTH;
    group_id_34372 = get_group_id(0);
    
    int32_t gtid_34344;
    int32_t gtid_34345;
    int32_t gtid_34369;
    __local char *red_arr_mem_38801;
    
    red_arr_mem_38801 = (__local char *) red_arr_mem_38801_backing_0;
    for (int32_t i_38803 = 0; i_38803 < squot32(squot32(sizze_31215 *
                                                        res_31237 +
                                                        squot32(group_sizze_34461,
                                                                segment_sizze_nonzzero_38800) -
                                                        1,
                                                        squot32(group_sizze_34461,
                                                                segment_sizze_nonzzero_38800)) -
                                                group_id_34372 +
                                                num_groups_34471 - 1,
                                                num_groups_34471); i_38803++) {
        gtid_34344 = squot32(squot32(local_tid_34371,
                                     segment_sizze_nonzzero_38800) +
                             (group_id_34372 + i_38803 * num_groups_34471) *
                             squot32(group_sizze_34461,
                                     segment_sizze_nonzzero_38800), res_31237);
        gtid_34345 = squot32(local_tid_34371, segment_sizze_nonzzero_38800) +
            (group_id_34372 + i_38803 * num_groups_34471) *
            squot32(group_sizze_34461, segment_sizze_nonzzero_38800) -
            squot32(squot32(local_tid_34371, segment_sizze_nonzzero_38800) +
                    (group_id_34372 + i_38803 * num_groups_34471) *
                    squot32(group_sizze_34461, segment_sizze_nonzzero_38800),
                    res_31237) * res_31237;
        gtid_34369 = srem32(local_tid_34371, j_m_i_31370);
        // apply map function if in bounds
        {
            if (slt32(0, j_m_i_31370) && ((slt32(gtid_34344, sizze_31215) &&
                                           slt32(gtid_34345, res_31237)) &&
                                          slt32(local_tid_34371, j_m_i_31370 *
                                                squot32(group_sizze_34461,
                                                        segment_sizze_nonzzero_38800)))) {
                int32_t binop_x_37160;
                int32_t binop_x_37161;
                int32_t new_index_37162;
                int32_t binop_y_37168;
                int32_t new_index_37169;
                float x_34483;
                float x_34484;
                float res_34485;
                
                binop_x_37160 = j_m_i_31370 * gtid_34344;
                binop_x_37161 = gtid_34369 + binop_x_37160;
                new_index_37162 = squot32(binop_x_37161, res_31237);
                binop_y_37168 = res_31237 * new_index_37162;
                new_index_37169 = binop_x_37161 - binop_y_37168;
                x_34483 = *(__global float *) &res_mem_38086[(new_index_37162 *
                                                              res_31237 +
                                                              new_index_37169) *
                                                             4];
                x_34484 = *(__global float *) &res_mem_38037[(gtid_34344 *
                                                              (j_m_i_31370 *
                                                               res_31237) +
                                                              gtid_34345 *
                                                              j_m_i_31370 +
                                                              gtid_34369) * 4];
                res_34485 = x_34483 * x_34484;
                // save results to be reduced
                {
                    *(__local float *) &red_arr_mem_38801[local_tid_34371 * 4] =
                        res_34485;
                }
                // save map-out results
                { }
            } else {
                *(__local float *) &red_arr_mem_38801[local_tid_34371 * 4] =
                    0.0F;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (slt32(0, j_m_i_31370)) {
            // perform segmented scan to imitate reduction
            {
                float x_34477;
                float x_34478;
                float x_38804;
                float x_38805;
                int32_t skip_threads_38807;
                
                if (slt32(local_tid_34371, j_m_i_31370 *
                          squot32(group_sizze_34461,
                                  segment_sizze_nonzzero_38800))) {
                    x_34478 = *(volatile __local
                                float *) &red_arr_mem_38801[local_tid_34371 *
                                                            sizeof(float)];
                }
                // in-block scan (hopefully no barriers needed)
                {
                    skip_threads_38807 = 1;
                    while (slt32(skip_threads_38807, 32)) {
                        if (sle32(skip_threads_38807, local_tid_34371 -
                                  squot32(local_tid_34371, 32) * 32) &&
                            slt32(local_tid_34371, j_m_i_31370 *
                                  squot32(group_sizze_34461,
                                          segment_sizze_nonzzero_38800))) {
                            // read operands
                            {
                                x_34477 = *(volatile __local
                                            float *) &red_arr_mem_38801[(local_tid_34371 -
                                                                         skip_threads_38807) *
                                                                        sizeof(float)];
                            }
                            // perform operation
                            {
                                if (!slt32(srem32(local_tid_34371, j_m_i_31370),
                                           local_tid_34371 - (local_tid_34371 -
                                                              skip_threads_38807))) {
                                    float res_34479 = x_34477 + x_34478;
                                    
                                    x_34478 = res_34479;
                                }
                            }
                        }
                        if (sle32(wave_sizze_38798, skip_threads_38807)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        if (sle32(skip_threads_38807, local_tid_34371 -
                                  squot32(local_tid_34371, 32) * 32) &&
                            slt32(local_tid_34371, j_m_i_31370 *
                                  squot32(group_sizze_34461,
                                          segment_sizze_nonzzero_38800))) {
                            // write result
                            {
                                *(volatile __local
                                  float *) &red_arr_mem_38801[local_tid_34371 *
                                                              sizeof(float)] =
                                    x_34478;
                            }
                        }
                        if (sle32(wave_sizze_38798, skip_threads_38807)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        skip_threads_38807 *= 2;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // last thread of block 'i' writes its result to offset 'i'
                {
                    if ((local_tid_34371 - squot32(local_tid_34371, 32) * 32) ==
                        31 && slt32(local_tid_34371, j_m_i_31370 *
                                    squot32(group_sizze_34461,
                                            segment_sizze_nonzzero_38800))) {
                        *(volatile __local
                          float *) &red_arr_mem_38801[squot32(local_tid_34371,
                                                              32) *
                                                      sizeof(float)] = x_34478;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
                {
                    int32_t skip_threads_38808;
                    
                    if (squot32(local_tid_34371, 32) == 0 &&
                        slt32(local_tid_34371, j_m_i_31370 *
                              squot32(group_sizze_34461,
                                      segment_sizze_nonzzero_38800))) {
                        x_38805 = *(volatile __local
                                    float *) &red_arr_mem_38801[local_tid_34371 *
                                                                sizeof(float)];
                    }
                    // in-block scan (hopefully no barriers needed)
                    {
                        skip_threads_38808 = 1;
                        while (slt32(skip_threads_38808, 32)) {
                            if (sle32(skip_threads_38808, local_tid_34371 -
                                      squot32(local_tid_34371, 32) * 32) &&
                                (squot32(local_tid_34371, 32) == 0 &&
                                 slt32(local_tid_34371, j_m_i_31370 *
                                       squot32(group_sizze_34461,
                                               segment_sizze_nonzzero_38800)))) {
                                // read operands
                                {
                                    x_38804 = *(volatile __local
                                                float *) &red_arr_mem_38801[(local_tid_34371 -
                                                                             skip_threads_38808) *
                                                                            sizeof(float)];
                                }
                                // perform operation
                                {
                                    if (!slt32(srem32(local_tid_34371 * 32 +
                                                      32 - 1, j_m_i_31370),
                                               local_tid_34371 * 32 + 32 - 1 -
                                               ((local_tid_34371 -
                                                 skip_threads_38808) * 32 + 32 -
                                                1))) {
                                        float res_38806 = x_38804 + x_38805;
                                        
                                        x_38805 = res_38806;
                                    }
                                }
                            }
                            if (sle32(wave_sizze_38798, skip_threads_38808)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            if (sle32(skip_threads_38808, local_tid_34371 -
                                      squot32(local_tid_34371, 32) * 32) &&
                                (squot32(local_tid_34371, 32) == 0 &&
                                 slt32(local_tid_34371, j_m_i_31370 *
                                       squot32(group_sizze_34461,
                                               segment_sizze_nonzzero_38800)))) {
                                // write result
                                {
                                    *(volatile __local
                                      float *) &red_arr_mem_38801[local_tid_34371 *
                                                                  sizeof(float)] =
                                        x_38805;
                                }
                            }
                            if (sle32(wave_sizze_38798, skip_threads_38808)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            skip_threads_38808 *= 2;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // carry-in for every block except the first
                {
                    if (!(squot32(local_tid_34371, 32) == 0 ||
                          !slt32(local_tid_34371, j_m_i_31370 *
                                 squot32(group_sizze_34461,
                                         segment_sizze_nonzzero_38800)))) {
                        // read operands
                        {
                            x_34477 = *(volatile __local
                                        float *) &red_arr_mem_38801[(squot32(local_tid_34371,
                                                                             32) -
                                                                     1) *
                                                                    sizeof(float)];
                        }
                        // perform operation
                        {
                            if (!slt32(srem32(local_tid_34371, j_m_i_31370),
                                       local_tid_34371 -
                                       (squot32(local_tid_34371, 32) * 32 -
                                        1))) {
                                float res_34479 = x_34477 + x_34478;
                                
                                x_34478 = res_34479;
                            }
                        }
                        // write final result
                        {
                            *(volatile __local
                              float *) &red_arr_mem_38801[local_tid_34371 *
                                                          sizeof(float)] =
                                x_34478;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // restore correct values for first block
                {
                    if (squot32(local_tid_34371, 32) == 0) {
                        *(volatile __local
                          float *) &red_arr_mem_38801[local_tid_34371 *
                                                      sizeof(float)] = x_34478;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // save final values of segments
        {
            if (slt32((group_id_34372 + i_38803 * num_groups_34471) *
                      squot32(group_sizze_34461, segment_sizze_nonzzero_38800) +
                      local_tid_34371, sizze_31215 * res_31237) &&
                slt32(local_tid_34371, squot32(group_sizze_34461,
                                               segment_sizze_nonzzero_38800))) {
                *(__global float *) &mem_38138[(squot32((group_id_34372 +
                                                         i_38803 *
                                                         num_groups_34471) *
                                                        squot32(group_sizze_34461,
                                                                segment_sizze_nonzzero_38800) +
                                                        local_tid_34371,
                                                        res_31237) * res_31237 +
                                                ((group_id_34372 + i_38803 *
                                                  num_groups_34471) *
                                                 squot32(group_sizze_34461,
                                                         segment_sizze_nonzzero_38800) +
                                                 local_tid_34371 -
                                                 squot32((group_id_34372 +
                                                          i_38803 *
                                                          num_groups_34471) *
                                                         squot32(group_sizze_34461,
                                                                 segment_sizze_nonzzero_38800) +
                                                         local_tid_34371,
                                                         res_31237) *
                                                 res_31237)) * 4] = *(__local
                                                                      float *) &red_arr_mem_38801[((local_tid_34371 +
                                                                                                    1) *
                                                                                                   segment_sizze_nonzzero_38800 -
                                                                                                   1) *
                                                                                                  4];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
// segred_small_34689: segmented sum-reduction of element-wise products.
//
// For every segment (gtid_34662, gtid_34663) with gtid_34662 < sizze_31215
// and gtid_34663 < sizze_31214, the kernel adds up
//     res_mem_38142[gtid_34662 * res_31237 + k] *
//     mem_37911[gtid_34663 * res_31237 + k]
// for k in [0, res_31237) and stores the float sum at
//     mem_38195[gtid_34662 * sizze_31214 + gtid_34663].
// All three buffers are passed as bytes and accessed as float; the "* 4"
// and "* sizeof(float)" factors turn element indices into byte offsets.
//
// A work-group handles
//     squot32(group_sizze_34780, segment_sizze_nonzzero_38861)
// segments per loop iteration, one thread per segment element.  The sum is
// obtained by running a segmented inclusive scan over local memory and then
// reading the last slot of each segment.
//
// Parameters:
//   sizze_31214  - extent of the inner segment index; also the row stride
//                  used when writing mem_38195.
//   sizze_31215  - extent of the outer segment index.
//   res_31237    - number of elements per segment; also the row stride used
//                  when reading both inputs.
//   num_groups_34790 - stride between the virtual group numbers handled by
//                  one work-group (presumably the number of work-groups
//                  launched; confirm against the host code).
//   mem_37911, res_mem_38142 - input buffers.
//   mem_38195    - output buffer.
//   segment_sizze_nonzzero_38861 - divisor used instead of res_31237 when
//                  counting segments per group (presumably res_31237
//                  clamped to at least 1 by the host; TODO confirm).
//
// squot32, srem32, slt32 and sle32 are defined outside this kernel; from
// their use here they appear to be signed 32-bit quotient, remainder,
// less-than and less-or-equal.  ALIGNED_LOCAL_MEMORY, LOCKSTEP_WIDTH and
// mainzigroup_sizze_34671 are likewise defined elsewhere.
__kernel void segred_small_34689(int32_t sizze_31214, int32_t sizze_31215,
                                 int32_t res_31237, int32_t num_groups_34790,
                                 __global unsigned char *mem_37911, __global
                                 unsigned char *res_mem_38142, __global
                                 unsigned char *mem_38195,
                                 int32_t segment_sizze_nonzzero_38861)
{
    const int32_t group_sizze_34780 = mainzigroup_sizze_34671;
    // block_dim0..block_dim2 are not referenced again in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    // Backing store for the reduction array: 4 bytes (one float) per thread
    // of a work-group of size mainzigroup_sizze_34671.
    ALIGNED_LOCAL_MEMORY(red_arr_mem_38862_backing_0, 4 *
                         mainzigroup_sizze_34671);
    
    // global_tid_34689 and group_sizze_38860 are assigned below but not
    // read anywhere else in this kernel.
    int32_t global_tid_34689;
    int32_t local_tid_34690;
    int32_t group_sizze_38860;
    int32_t wave_sizze_38859;
    int32_t group_id_34691;
    
    global_tid_34689 = get_global_id(0);
    local_tid_34690 = get_local_id(0);
    group_sizze_38860 = get_local_size(0);
    wave_sizze_38859 = LOCKSTEP_WIDTH;
    group_id_34691 = get_group_id(0);
    
    int32_t gtid_34662;
    int32_t gtid_34663;
    int32_t gtid_34688;
    __local char *red_arr_mem_38862;
    
    red_arr_mem_38862 = (__local char *) red_arr_mem_38862_backing_0;
    // Iteration i_38864 processes virtual group
    //     group_id_34691 + i_38864 * num_groups_34790.
    // With S = squot32(group_sizze_34780, segment_sizze_nonzzero_38861)
    // segments per group, the bound below is the round-up quotient
    //     ((sizze_31215 * sizze_31214 + S - 1) / S - group_id_34691 +
    //      num_groups_34790 - 1) / num_groups_34790.
    for (int32_t i_38864 = 0; i_38864 < squot32(squot32(sizze_31215 *
                                                        sizze_31214 +
                                                        squot32(group_sizze_34780,
                                                                segment_sizze_nonzzero_38861) -
                                                        1,
                                                        squot32(group_sizze_34780,
                                                                segment_sizze_nonzzero_38861)) -
                                                group_id_34691 +
                                                num_groups_34790 - 1,
                                                num_groups_34790); i_38864++) {
        // Flat segment index of this thread:
        //     local_tid / segment_size + virtual_group * S.
        // gtid_34662 is that index divided by sizze_31214 ...
        gtid_34662 = squot32(squot32(local_tid_34690,
                                     segment_sizze_nonzzero_38861) +
                             (group_id_34691 + i_38864 * num_groups_34790) *
                             squot32(group_sizze_34780,
                                     segment_sizze_nonzzero_38861),
                             sizze_31214);
        // ... and gtid_34663 is what is left after subtracting
        // gtid_34662 * sizze_31214 from the flat index.
        gtid_34663 = squot32(local_tid_34690, segment_sizze_nonzzero_38861) +
            (group_id_34691 + i_38864 * num_groups_34790) *
            squot32(group_sizze_34780, segment_sizze_nonzzero_38861) -
            squot32(squot32(local_tid_34690, segment_sizze_nonzzero_38861) +
                    (group_id_34691 + i_38864 * num_groups_34790) *
                    squot32(group_sizze_34780, segment_sizze_nonzzero_38861),
                    sizze_31214) * sizze_31214;
        // Position of this thread's element inside its segment.  The value
        // is only used below when res_31237 > 0.
        gtid_34688 = srem32(local_tid_34690, res_31237);
        // apply map function if in bounds
        {
            // In bounds means: segments are non-empty, the segment exists,
            // and this thread falls inside the S * res_31237 slots that the
            // group actually uses.
            if (slt32(0, res_31237) && ((slt32(gtid_34662, sizze_31215) &&
                                         slt32(gtid_34663, sizze_31214)) &&
                                        slt32(local_tid_34690, res_31237 *
                                              squot32(group_sizze_34780,
                                                      segment_sizze_nonzzero_38861)))) {
                float x_34801;
                float x_34802;
                float res_34803;
                
                x_34801 = *(__global float *) &res_mem_38142[(gtid_34662 *
                                                              res_31237 +
                                                              gtid_34688) * 4];
                x_34802 = *(__global float *) &mem_37911[(gtid_34663 *
                                                          res_31237 +
                                                          gtid_34688) * 4];
                res_34803 = x_34801 * x_34802;
                // save results to be reduced
                {
                    *(__local float *) &red_arr_mem_38862[local_tid_34690 * 4] =
                        res_34803;
                }
                // save map-out results
                { }
            } else {
                // Out-of-bounds threads contribute 0.0F to the sum.
                *(__local float *) &red_arr_mem_38862[local_tid_34690 * 4] =
                    0.0F;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (slt32(0, res_31237)) {
            // perform segmented scan to imitate reduction
            {
                // x_34797 is this thread's running value and x_34796 the
                // value fetched from an earlier slot.  x_38866 / x_38865
                // play the same roles in the scan over per-block totals.
                float x_34796;
                float x_34797;
                float x_38865;
                float x_38866;
                int32_t skip_threads_38868;
                
                if (slt32(local_tid_34690, res_31237 *
                          squot32(group_sizze_34780,
                                  segment_sizze_nonzzero_38861))) {
                    x_34797 = *(volatile __local
                                float *) &red_arr_mem_38862[local_tid_34690 *
                                                            sizeof(float)];
                }
                // in-block scan (hopefully no barriers needed)
                {
                    // Phase 1: scan inside each block of 32 consecutive
                    // threads, with the skip distance doubling from 1 to 16.
                    skip_threads_38868 = 1;
                    while (slt32(skip_threads_38868, 32)) {
                        // Active when the lane index within the 32-block is
                        // at least the skip distance and the thread is in
                        // bounds.
                        if (sle32(skip_threads_38868, local_tid_34690 -
                                  squot32(local_tid_34690, 32) * 32) &&
                            slt32(local_tid_34690, res_31237 *
                                  squot32(group_sizze_34780,
                                          segment_sizze_nonzzero_38861))) {
                            // read operands
                            {
                                x_34796 = *(volatile __local
                                            float *) &red_arr_mem_38862[(local_tid_34690 -
                                                                         skip_threads_38868) *
                                                                        sizeof(float)];
                            }
                            // perform operation
                            {
                                // Combine only if the slot skip_threads
                                // positions back lies in the same segment,
                                // i.e. the position within the segment is
                                // not smaller than the skip distance.
                                if (!slt32(srem32(local_tid_34690, res_31237),
                                           local_tid_34690 - (local_tid_34690 -
                                                              skip_threads_38868))) {
                                    float res_34798 = x_34796 + x_34797;
                                    
                                    x_34797 = res_34798;
                                }
                            }
                        }
                        // Both operands of this test have the same value in
                        // every thread, so the barrier is either reached by
                        // the whole work-group or by no thread at all.
                        if (sle32(wave_sizze_38859, skip_threads_38868)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        if (sle32(skip_threads_38868, local_tid_34690 -
                                  squot32(local_tid_34690, 32) * 32) &&
                            slt32(local_tid_34690, res_31237 *
                                  squot32(group_sizze_34780,
                                          segment_sizze_nonzzero_38861))) {
                            // write result
                            {
                                *(volatile __local
                                  float *) &red_arr_mem_38862[local_tid_34690 *
                                                              sizeof(float)] =
                                    x_34797;
                            }
                        }
                        if (sle32(wave_sizze_38859, skip_threads_38868)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        skip_threads_38868 *= 2;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // last thread of block 'i' writes its result to offset 'i'
                {
                    // This overwrites slots owned by the first block; they
                    // are put back by the "restore" step further down.
                    if ((local_tid_34690 - squot32(local_tid_34690, 32) * 32) ==
                        31 && slt32(local_tid_34690, res_31237 *
                                    squot32(group_sizze_34780,
                                            segment_sizze_nonzzero_38861))) {
                        *(volatile __local
                          float *) &red_arr_mem_38862[squot32(local_tid_34690,
                                                              32) *
                                                      sizeof(float)] = x_34797;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
                {
                    // Phase 2: only threads of the first 32-block take part;
                    // thread t stands for the last element of block t, that
                    // is element t * 32 + 32 - 1.
                    int32_t skip_threads_38869;
                    
                    if (squot32(local_tid_34690, 32) == 0 &&
                        slt32(local_tid_34690, res_31237 *
                              squot32(group_sizze_34780,
                                      segment_sizze_nonzzero_38861))) {
                        x_38866 = *(volatile __local
                                    float *) &red_arr_mem_38862[local_tid_34690 *
                                                                sizeof(float)];
                    }
                    // in-block scan (hopefully no barriers needed)
                    {
                        skip_threads_38869 = 1;
                        while (slt32(skip_threads_38869, 32)) {
                            if (sle32(skip_threads_38869, local_tid_34690 -
                                      squot32(local_tid_34690, 32) * 32) &&
                                (squot32(local_tid_34690, 32) == 0 &&
                                 slt32(local_tid_34690, res_31237 *
                                       squot32(group_sizze_34780,
                                               segment_sizze_nonzzero_38861)))) {
                                // read operands
                                {
                                    x_38865 = *(volatile __local
                                                float *) &red_arr_mem_38862[(local_tid_34690 -
                                                                             skip_threads_38869) *
                                                                            sizeof(float)];
                                }
                                // perform operation
                                {
                                    // Same segment test as in phase 1, but
                                    // applied to the represented elements:
                                    // the distance between the last elements
                                    // of block t and block t - skip must not
                                    // exceed the position of element
                                    // t * 32 + 31 within its segment.
                                    if (!slt32(srem32(local_tid_34690 * 32 +
                                                      32 - 1, res_31237),
                                               local_tid_34690 * 32 + 32 - 1 -
                                               ((local_tid_34690 -
                                                 skip_threads_38869) * 32 + 32 -
                                                1))) {
                                        float res_38867 = x_38865 + x_38866;
                                        
                                        x_38866 = res_38867;
                                    }
                                }
                            }
                            if (sle32(wave_sizze_38859, skip_threads_38869)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            if (sle32(skip_threads_38869, local_tid_34690 -
                                      squot32(local_tid_34690, 32) * 32) &&
                                (squot32(local_tid_34690, 32) == 0 &&
                                 slt32(local_tid_34690, res_31237 *
                                       squot32(group_sizze_34780,
                                               segment_sizze_nonzzero_38861)))) {
                                // write result
                                {
                                    *(volatile __local
                                      float *) &red_arr_mem_38862[local_tid_34690 *
                                                                  sizeof(float)] =
                                        x_38866;
                                }
                            }
                            if (sle32(wave_sizze_38859, skip_threads_38869)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            skip_threads_38869 *= 2;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // carry-in for every block except the first
                {
                    // Phase 3: in-bounds threads outside the first block
                    // fetch the scanned total of the preceding block.
                    if (!(squot32(local_tid_34690, 32) == 0 ||
                          !slt32(local_tid_34690, res_31237 *
                                 squot32(group_sizze_34780,
                                         segment_sizze_nonzzero_38861)))) {
                        // read operands
                        {
                            x_34796 = *(volatile __local
                                        float *) &red_arr_mem_38862[(squot32(local_tid_34690,
                                                                             32) -
                                                                     1) *
                                                                    sizeof(float)];
                        }
                        // perform operation
                        {
                            // Add the carry only if the last element of the
                            // preceding block (index block * 32 - 1) lies in
                            // the same segment as this thread's element.
                            if (!slt32(srem32(local_tid_34690, res_31237),
                                       local_tid_34690 -
                                       (squot32(local_tid_34690, 32) * 32 -
                                        1))) {
                                float res_34798 = x_34796 + x_34797;
                                
                                x_34797 = res_34798;
                            }
                        }
                        // write final result
                        {
                            *(volatile __local
                              float *) &red_arr_mem_38862[local_tid_34690 *
                                                          sizeof(float)] =
                                x_34797;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // restore correct values for first block
                {
                    // NOTE(review): there is no in-bounds test here, so a
                    // first-block thread that is out of bounds stores an
                    // x_34797 that was never loaded.  Those slots do not
                    // appear to be read by the final save below - confirm.
                    if (squot32(local_tid_34690, 32) == 0) {
                        *(volatile __local
                          float *) &red_arr_mem_38862[local_tid_34690 *
                                                      sizeof(float)] = x_34797;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // save final values of segments
        {
            // Thread t (t < S) writes out segment t of this virtual group,
            // provided its flat segment index virtual_group * S + t is below
            // sizze_31215 * sizze_31214.  The destination element index is
            // quotient * sizze_31214 + remainder of that flat index with
            // respect to sizze_31214; the source is the last slot of segment
            // t, (t + 1) * segment_sizze_nonzzero_38861 - 1.
            if (slt32((group_id_34691 + i_38864 * num_groups_34790) *
                      squot32(group_sizze_34780, segment_sizze_nonzzero_38861) +
                      local_tid_34690, sizze_31215 * sizze_31214) &&
                slt32(local_tid_34690, squot32(group_sizze_34780,
                                               segment_sizze_nonzzero_38861))) {
                *(__global float *) &mem_38195[(squot32((group_id_34691 +
                                                         i_38864 *
                                                         num_groups_34790) *
                                                        squot32(group_sizze_34780,
                                                                segment_sizze_nonzzero_38861) +
                                                        local_tid_34690,
                                                        sizze_31214) *
                                                sizze_31214 + ((group_id_34691 +
                                                                i_38864 *
                                                                num_groups_34790) *
                                                               squot32(group_sizze_34780,
                                                                       segment_sizze_nonzzero_38861) +
                                                               local_tid_34690 -
                                                               squot32((group_id_34691 +
                                                                        i_38864 *
                                                                        num_groups_34790) *
                                                                       squot32(group_sizze_34780,
                                                                               segment_sizze_nonzzero_38861) +
                                                                       local_tid_34690,
                                                                       sizze_31214) *
                                                               sizze_31214)) *
                                               4] = *(__local
                                                      float *) &red_arr_mem_38862[((local_tid_34690 +
                                                                                    1) *
                                                                                   segment_sizze_nonzzero_38861 -
                                                                                   1) *
                                                                                  4];
            }
        }
        // Keep local memory intact until every thread has finished reading
        // before the next iteration overwrites it.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
__kernel void segred_small_35467(int32_t sizze_31214, int32_t sizze_31215,
                                 int32_t n_31219, int32_t num_groups_35536,
                                 __global unsigned char *res_mem_38290, __global
                                 unsigned char *mem_38326, __global
                                 unsigned char *mem_38329,
                                 int32_t segment_sizze_nonzzero_39039)
{
    const int32_t group_sizze_35526 = mainzigroup_sizze_35449;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39040_backing_0, 4 *
                         mainzigroup_sizze_35449);
    
    int32_t global_tid_35467;
    int32_t local_tid_35468;
    int32_t group_sizze_39038;
    int32_t wave_sizze_39037;
    int32_t group_id_35469;
    
    global_tid_35467 = get_global_id(0);
    local_tid_35468 = get_local_id(0);
    group_sizze_39038 = get_local_size(0);
    wave_sizze_39037 = LOCKSTEP_WIDTH;
    group_id_35469 = get_group_id(0);
    
    int32_t gtid_35444;
    int32_t gtid_35466;
    __local char *red_arr_mem_39040;
    
    red_arr_mem_39040 = (__local char *) red_arr_mem_39040_backing_0;
    for (int32_t i_39042 = 0; i_39042 < squot32(squot32(sizze_31215 +
                                                        squot32(group_sizze_35526,
                                                                segment_sizze_nonzzero_39039) -
                                                        1,
                                                        squot32(group_sizze_35526,
                                                                segment_sizze_nonzzero_39039)) -
                                                group_id_35469 +
                                                num_groups_35536 - 1,
                                                num_groups_35536); i_39042++) {
        gtid_35444 = squot32(local_tid_35468, segment_sizze_nonzzero_39039) +
            (group_id_35469 + i_39042 * num_groups_35536) *
            squot32(group_sizze_35526, segment_sizze_nonzzero_39039);
        gtid_35466 = srem32(local_tid_35468, n_31219);
        // apply map function if in bounds
        {
            if (slt32(0, n_31219) && (slt32(gtid_35444, sizze_31215) &&
                                      slt32(local_tid_35468, n_31219 *
                                            squot32(group_sizze_35526,
                                                    segment_sizze_nonzzero_39039)))) {
                int32_t res_35546;
                bool cond_35548;
                float res_35549;
                float res_35551;
                
                res_35546 = *(__global int32_t *) &mem_38326[gtid_35444 * 4];
                cond_35548 = slt32(gtid_35466, res_35546);
                if (cond_35548) {
                    float res_35550 = *(__global
                                        float *) &res_mem_38290[(gtid_35444 *
                                                                 sizze_31214 +
                                                                 gtid_35466) *
                                                                4];
                    
                    res_35549 = res_35550;
                } else {
                    res_35549 = 0.0F;
                }
                res_35551 = res_35549 * res_35549;
                // save results to be reduced
                {
                    *(__local float *) &red_arr_mem_39040[local_tid_35468 * 4] =
                        res_35551;
                }
                // save map-out results
                { }
            } else {
                *(__local float *) &red_arr_mem_39040[local_tid_35468 * 4] =
                    0.0F;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (slt32(0, n_31219)) {
            // perform segmented scan to imitate reduction
            {
                float x_35542;
                float x_35543;
                float x_39043;
                float x_39044;
                int32_t skip_threads_39046;
                
                if (slt32(local_tid_35468, n_31219 * squot32(group_sizze_35526,
                                                             segment_sizze_nonzzero_39039))) {
                    x_35543 = *(volatile __local
                                float *) &red_arr_mem_39040[local_tid_35468 *
                                                            sizeof(float)];
                }
                // in-block scan (hopefully no barriers needed)
                {
                    skip_threads_39046 = 1;
                    while (slt32(skip_threads_39046, 32)) {
                        if (sle32(skip_threads_39046, local_tid_35468 -
                                  squot32(local_tid_35468, 32) * 32) &&
                            slt32(local_tid_35468, n_31219 *
                                  squot32(group_sizze_35526,
                                          segment_sizze_nonzzero_39039))) {
                            // read operands
                            {
                                x_35542 = *(volatile __local
                                            float *) &red_arr_mem_39040[(local_tid_35468 -
                                                                         skip_threads_39046) *
                                                                        sizeof(float)];
                            }
                            // perform operation
                            {
                                if (!slt32(srem32(local_tid_35468, n_31219),
                                           local_tid_35468 - (local_tid_35468 -
                                                              skip_threads_39046))) {
                                    float res_35544 = x_35542 + x_35543;
                                    
                                    x_35543 = res_35544;
                                }
                            }
                        }
                        if (sle32(wave_sizze_39037, skip_threads_39046)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        if (sle32(skip_threads_39046, local_tid_35468 -
                                  squot32(local_tid_35468, 32) * 32) &&
                            slt32(local_tid_35468, n_31219 *
                                  squot32(group_sizze_35526,
                                          segment_sizze_nonzzero_39039))) {
                            // write result
                            {
                                *(volatile __local
                                  float *) &red_arr_mem_39040[local_tid_35468 *
                                                              sizeof(float)] =
                                    x_35543;
                            }
                        }
                        if (sle32(wave_sizze_39037, skip_threads_39046)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        skip_threads_39046 *= 2;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // last thread of block 'i' writes its result to offset 'i'
                {
                    if ((local_tid_35468 - squot32(local_tid_35468, 32) * 32) ==
                        31 && slt32(local_tid_35468, n_31219 *
                                    squot32(group_sizze_35526,
                                            segment_sizze_nonzzero_39039))) {
                        *(volatile __local
                          float *) &red_arr_mem_39040[squot32(local_tid_35468,
                                                              32) *
                                                      sizeof(float)] = x_35543;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
                {
                    int32_t skip_threads_39047;
                    
                    if (squot32(local_tid_35468, 32) == 0 &&
                        slt32(local_tid_35468, n_31219 *
                              squot32(group_sizze_35526,
                                      segment_sizze_nonzzero_39039))) {
                        x_39044 = *(volatile __local
                                    float *) &red_arr_mem_39040[local_tid_35468 *
                                                                sizeof(float)];
                    }
                    // in-block scan (hopefully no barriers needed)
                    {
                        skip_threads_39047 = 1;
                        while (slt32(skip_threads_39047, 32)) {
                            if (sle32(skip_threads_39047, local_tid_35468 -
                                      squot32(local_tid_35468, 32) * 32) &&
                                (squot32(local_tid_35468, 32) == 0 &&
                                 slt32(local_tid_35468, n_31219 *
                                       squot32(group_sizze_35526,
                                               segment_sizze_nonzzero_39039)))) {
                                // read operands
                                {
                                    x_39043 = *(volatile __local
                                                float *) &red_arr_mem_39040[(local_tid_35468 -
                                                                             skip_threads_39047) *
                                                                            sizeof(float)];
                                }
                                // perform operation
                                {
                                    if (!slt32(srem32(local_tid_35468 * 32 +
                                                      32 - 1, n_31219),
                                               local_tid_35468 * 32 + 32 - 1 -
                                               ((local_tid_35468 -
                                                 skip_threads_39047) * 32 + 32 -
                                                1))) {
                                        float res_39045 = x_39043 + x_39044;
                                        
                                        x_39044 = res_39045;
                                    }
                                }
                            }
                            if (sle32(wave_sizze_39037, skip_threads_39047)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            if (sle32(skip_threads_39047, local_tid_35468 -
                                      squot32(local_tid_35468, 32) * 32) &&
                                (squot32(local_tid_35468, 32) == 0 &&
                                 slt32(local_tid_35468, n_31219 *
                                       squot32(group_sizze_35526,
                                               segment_sizze_nonzzero_39039)))) {
                                // write result
                                {
                                    *(volatile __local
                                      float *) &red_arr_mem_39040[local_tid_35468 *
                                                                  sizeof(float)] =
                                        x_39044;
                                }
                            }
                            if (sle32(wave_sizze_39037, skip_threads_39047)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            skip_threads_39047 *= 2;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // carry-in for every block except the first
                {
                    if (!(squot32(local_tid_35468, 32) == 0 ||
                          !slt32(local_tid_35468, n_31219 *
                                 squot32(group_sizze_35526,
                                         segment_sizze_nonzzero_39039)))) {
                        // read operands
                        {
                            x_35542 = *(volatile __local
                                        float *) &red_arr_mem_39040[(squot32(local_tid_35468,
                                                                             32) -
                                                                     1) *
                                                                    sizeof(float)];
                        }
                        // perform operation
                        {
                            if (!slt32(srem32(local_tid_35468, n_31219),
                                       local_tid_35468 -
                                       (squot32(local_tid_35468, 32) * 32 -
                                        1))) {
                                float res_35544 = x_35542 + x_35543;
                                
                                x_35543 = res_35544;
                            }
                        }
                        // write final result
                        {
                            *(volatile __local
                              float *) &red_arr_mem_39040[local_tid_35468 *
                                                          sizeof(float)] =
                                x_35543;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // restore correct values for first block
                {
                    if (squot32(local_tid_35468, 32) == 0) {
                        *(volatile __local
                          float *) &red_arr_mem_39040[local_tid_35468 *
                                                      sizeof(float)] = x_35543;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // save final values of segments
        {
            if (slt32((group_id_35469 + i_39042 * num_groups_35536) *
                      squot32(group_sizze_35526, segment_sizze_nonzzero_39039) +
                      local_tid_35468, sizze_31215) && slt32(local_tid_35468,
                                                             squot32(group_sizze_35526,
                                                                     segment_sizze_nonzzero_39039))) {
                *(__global float *) &mem_38329[((group_id_35469 + i_39042 *
                                                 num_groups_35536) *
                                                squot32(group_sizze_35526,
                                                        segment_sizze_nonzzero_39039) +
                                                local_tid_35468) * 4] =
                    *(__local float *) &red_arr_mem_39040[((local_tid_35468 +
                                                            1) *
                                                           segment_sizze_nonzzero_39039 -
                                                           1) * 4];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
// segred_small_35492: per-segment sum (int32, operator +) computed by a
// work-group-local segmented scan.
//
// For every segment index s < sizze_31215 and every position j < n_31219 the
// kernel reads the float at images_mem_37894[(s * sizze_31216 + j) * 4], maps
// it to 1 when futrts_isnan32 returns false and to 0 otherwise, sums these
// values per segment, and stores the int32 sum at mem_38326[s * 4].
//
// Parameters:
//   sizze_31215                  - upper bound on the segment index.
//   sizze_31216                  - row stride, in 4-byte elements, used to
//                                  index images_mem_37894.
//   n_31219                      - number of elements reduced per segment.
//   num_groups_35508             - stride between the segment chunks handled
//                                  by one work-group (presumably the number of
//                                  launched work-groups; confirm at call site).
//   images_mem_37894             - global byte buffer, read as float.
//   mem_38326                    - global byte buffer, written as int32.
//   segment_sizze_nonzzero_39004 - divisor used to compute how many segments
//                                  fit in one work-group (presumably n_31219
//                                  clamped to at least 1; confirm at call site).
//
// squot32, srem32, slt32, sle32, futrts_isnan32, ALIGNED_LOCAL_MEMORY,
// LOCKSTEP_WIDTH and mainzigroup_sizze_35474 are defined outside this kernel.
__kernel void segred_small_35492(int32_t sizze_31215, int32_t sizze_31216,
                                 int32_t n_31219, int32_t num_groups_35508,
                                 __global unsigned char *images_mem_37894,
                                 __global unsigned char *mem_38326,
                                 int32_t segment_sizze_nonzzero_39004)
{
    const int32_t group_sizze_35498 = mainzigroup_sizze_35474;
    // block_dim0..2 are not referenced anywhere else in this kernel.
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    // Backing storage for the reduction array: 4 bytes per work-item of the
    // compile-time group size (macro defined elsewhere).
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39005_backing_0, 4 *
                         mainzigroup_sizze_35474);
    
    int32_t global_tid_35492;
    int32_t local_tid_35493;
    int32_t group_sizze_39003;
    int32_t wave_sizze_39002;
    int32_t group_id_35494;
    
    // global_tid_35492 and group_sizze_39003 are assigned here but never read
    // again in this kernel.
    global_tid_35492 = get_global_id(0);
    local_tid_35493 = get_local_id(0);
    group_sizze_39003 = get_local_size(0);
    wave_sizze_39002 = LOCKSTEP_WIDTH;
    group_id_35494 = get_group_id(0);
    
    int32_t gtid_35470;
    int32_t gtid_35491;
    __local char *red_arr_mem_39005;
    
    red_arr_mem_39005 = (__local char *) red_arr_mem_39005_backing_0;
    // Let S = squot32(group_sizze_35498, segment_sizze_nonzzero_39004) be the
    // number of segments handled per iteration. The trip count is
    // (ceil(sizze_31215 / S) - group_id + num_groups - 1) / num_groups, so
    // iteration i of this group handles chunk (group_id + i * num_groups).
    for (int32_t i_39007 = 0; i_39007 < squot32(squot32(sizze_31215 +
                                                        squot32(group_sizze_35498,
                                                                segment_sizze_nonzzero_39004) -
                                                        1,
                                                        squot32(group_sizze_35498,
                                                                segment_sizze_nonzzero_39004)) -
                                                group_id_35494 +
                                                num_groups_35508 - 1,
                                                num_groups_35508); i_39007++) {
        // gtid_35470: segment index handled by this work-item;
        // gtid_35491: position of this work-item inside its segment.
        gtid_35470 = squot32(local_tid_35493, segment_sizze_nonzzero_39004) +
            (group_id_35494 + i_39007 * num_groups_35508) *
            squot32(group_sizze_35498, segment_sizze_nonzzero_39004);
        gtid_35491 = srem32(local_tid_35493, n_31219);
        // apply map function if in bounds
        {
            if (slt32(0, n_31219) && (slt32(gtid_35470, sizze_31215) &&
                                      slt32(local_tid_35493, n_31219 *
                                            squot32(group_sizze_35498,
                                                    segment_sizze_nonzzero_39004)))) {
                float x_35518;
                bool res_35519;
                bool cond_35520;
                int32_t res_35521;
                
                x_35518 = *(__global float *) &images_mem_37894[(gtid_35470 *
                                                                 sizze_31216 +
                                                                 gtid_35491) *
                                                                4];
                // Map step: 1 if futrts_isnan32 reports false for the element,
                // 0 otherwise.
                res_35519 = futrts_isnan32(x_35518);
                cond_35520 = !res_35519;
                if (cond_35520) {
                    res_35521 = 1;
                } else {
                    res_35521 = 0;
                }
                // save results to be reduced
                {
                    *(__local int32_t *) &red_arr_mem_39005[local_tid_35493 *
                                                            4] = res_35521;
                }
                // save map-out results
                { }
            } else {
                // Out-of-bounds work-items contribute 0, the neutral element
                // of the + operator used below.
                *(__local int32_t *) &red_arr_mem_39005[local_tid_35493 * 4] =
                    0;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (slt32(0, n_31219)) {
            // perform segmented scan to imitate reduction
            {
                // x_35515 / x_35514: running value and fetched operand of the
                // first-level scan; x_39009 / x_39008: the same for the
                // second-level scan over per-block totals.
                int32_t x_35514;
                int32_t x_35515;
                int32_t x_39008;
                int32_t x_39009;
                int32_t skip_threads_39011;
                
                // NOTE(review): x_35515 is only initialised for work-items
                // with local_tid < n_31219 * S; see the note at the
                // "restore correct values" step below.
                if (slt32(local_tid_35493, n_31219 * squot32(group_sizze_35498,
                                                             segment_sizze_nonzzero_39004))) {
                    x_35515 = *(volatile __local
                                int32_t *) &red_arr_mem_39005[local_tid_35493 *
                                                              sizeof(int32_t)];
                }
                // in-block scan (hopefully no barriers needed)
                {
                    // Blocks are 32 work-items wide; the stride doubles each
                    // round (1, 2, 4, 8, 16).
                    skip_threads_39011 = 1;
                    while (slt32(skip_threads_39011, 32)) {
                        // Active when the position inside the 32-wide block
                        // (local_tid minus 32 * (local_tid / 32)) is at least
                        // the stride and the work-item is in bounds.
                        if (sle32(skip_threads_39011, local_tid_35493 -
                                  squot32(local_tid_35493, 32) * 32) &&
                            slt32(local_tid_35493, n_31219 *
                                  squot32(group_sizze_35498,
                                          segment_sizze_nonzzero_39004))) {
                            // read operands
                            {
                                x_35514 = *(volatile __local
                                            int32_t *) &red_arr_mem_39005[(local_tid_35493 -
                                                                           skip_threads_39011) *
                                                                          sizeof(int32_t)];
                            }
                            // perform operation
                            {
                                // Combine only when the operand lies in the
                                // same segment, i.e. the position inside the
                                // segment is not less than the stride.
                                if (!slt32(srem32(local_tid_35493, n_31219),
                                           local_tid_35493 - (local_tid_35493 -
                                                              skip_threads_39011))) {
                                    int32_t res_35516 = x_35514 + x_35515;
                                    
                                    x_35515 = res_35516;
                                }
                            }
                        }
                        // A barrier is issued only once the stride reaches
                        // wave_sizze_39002 (LOCKSTEP_WIDTH); the condition is
                        // the same for every work-item in the group.
                        if (sle32(wave_sizze_39002, skip_threads_39011)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        if (sle32(skip_threads_39011, local_tid_35493 -
                                  squot32(local_tid_35493, 32) * 32) &&
                            slt32(local_tid_35493, n_31219 *
                                  squot32(group_sizze_35498,
                                          segment_sizze_nonzzero_39004))) {
                            // write result
                            {
                                *(volatile __local
                                  int32_t *) &red_arr_mem_39005[local_tid_35493 *
                                                                sizeof(int32_t)] =
                                    x_35515;
                            }
                        }
                        if (sle32(wave_sizze_39002, skip_threads_39011)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        skip_threads_39011 *= 2;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // last thread of block 'i' writes its result to offset 'i'
                {
                    if ((local_tid_35493 - squot32(local_tid_35493, 32) * 32) ==
                        31 && slt32(local_tid_35493, n_31219 *
                                    squot32(group_sizze_35498,
                                            segment_sizze_nonzzero_39004))) {
                        *(volatile __local
                          int32_t *) &red_arr_mem_39005[squot32(local_tid_35493,
                                                                32) *
                                                        sizeof(int32_t)] =
                            x_35515;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
                {
                    int32_t skip_threads_39012;
                    
                    // Only work-items of the first 32-wide block take part in
                    // this second-level scan over the per-block totals.
                    if (squot32(local_tid_35493, 32) == 0 &&
                        slt32(local_tid_35493, n_31219 *
                              squot32(group_sizze_35498,
                                      segment_sizze_nonzzero_39004))) {
                        x_39009 = *(volatile __local
                                    int32_t *) &red_arr_mem_39005[local_tid_35493 *
                                                                  sizeof(int32_t)];
                    }
                    // in-block scan (hopefully no barriers needed)
                    {
                        skip_threads_39012 = 1;
                        while (slt32(skip_threads_39012, 32)) {
                            if (sle32(skip_threads_39012, local_tid_35493 -
                                      squot32(local_tid_35493, 32) * 32) &&
                                (squot32(local_tid_35493, 32) == 0 &&
                                 slt32(local_tid_35493, n_31219 *
                                       squot32(group_sizze_35498,
                                               segment_sizze_nonzzero_39004)))) {
                                // read operands
                                {
                                    x_39008 = *(volatile __local
                                                int32_t *) &red_arr_mem_39005[(local_tid_35493 -
                                                                               skip_threads_39012) *
                                                                              sizeof(int32_t)];
                                }
                                // perform operation
                                {
                                    // Slot i stands for original index
                                    // i * 32 + 31 (last element of block i);
                                    // the same-segment test is done on those
                                    // original indices.
                                    if (!slt32(srem32(local_tid_35493 * 32 +
                                                      32 - 1, n_31219),
                                               local_tid_35493 * 32 + 32 - 1 -
                                               ((local_tid_35493 -
                                                 skip_threads_39012) * 32 + 32 -
                                                1))) {
                                        int32_t res_39010 = x_39008 + x_39009;
                                        
                                        x_39009 = res_39010;
                                    }
                                }
                            }
                            if (sle32(wave_sizze_39002, skip_threads_39012)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            if (sle32(skip_threads_39012, local_tid_35493 -
                                      squot32(local_tid_35493, 32) * 32) &&
                                (squot32(local_tid_35493, 32) == 0 &&
                                 slt32(local_tid_35493, n_31219 *
                                       squot32(group_sizze_35498,
                                               segment_sizze_nonzzero_39004)))) {
                                // write result
                                {
                                    *(volatile __local
                                      int32_t *) &red_arr_mem_39005[local_tid_35493 *
                                                                    sizeof(int32_t)] =
                                        x_39009;
                                }
                            }
                            if (sle32(wave_sizze_39002, skip_threads_39012)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            skip_threads_39012 *= 2;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // carry-in for every block except the first
                {
                    // Equivalent to: block index != 0 AND work-item in bounds.
                    if (!(squot32(local_tid_35493, 32) == 0 ||
                          !slt32(local_tid_35493, n_31219 *
                                 squot32(group_sizze_35498,
                                         segment_sizze_nonzzero_39004)))) {
                        // read operands
                        {
                            x_35514 = *(volatile __local
                                        int32_t *) &red_arr_mem_39005[(squot32(local_tid_35493,
                                                                               32) -
                                                                       1) *
                                                                      sizeof(int32_t)];
                        }
                        // perform operation
                        {
                            // Add the carry only when this work-item's segment
                            // reaches back to the last element of the previous
                            // block (index block * 32 - 1).
                            if (!slt32(srem32(local_tid_35493, n_31219),
                                       local_tid_35493 -
                                       (squot32(local_tid_35493, 32) * 32 -
                                        1))) {
                                int32_t res_35516 = x_35514 + x_35515;
                                
                                x_35515 = res_35516;
                            }
                        }
                        // write final result
                        {
                            *(volatile __local
                              int32_t *) &red_arr_mem_39005[local_tid_35493 *
                                                            sizeof(int32_t)] =
                                x_35515;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // restore correct values for first block
                {
                    // The first 32 slots were overwritten with per-block
                    // totals above; put the first-level scan values back.
                    // NOTE(review): this branch has no in-bounds test, so a
                    // first-block work-item with local_tid >= n_31219 * S
                    // stores an x_35515 that was never assigned. Those slots
                    // do not appear to be read by the final step below as long
                    // as segment_sizze_nonzzero_39004 equals n_31219 -
                    // TODO confirm.
                    if (squot32(local_tid_35493, 32) == 0) {
                        *(volatile __local
                          int32_t *) &red_arr_mem_39005[local_tid_35493 *
                                                        sizeof(int32_t)] =
                            x_35515;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // save final values of segments
        {
            // Work-item t < S writes the result of the t-th segment of this
            // chunk: the scan value stored in the last slot of that segment,
            // (t + 1) * segment_sizze_nonzzero_39004 - 1.
            if (slt32((group_id_35494 + i_39007 * num_groups_35508) *
                      squot32(group_sizze_35498, segment_sizze_nonzzero_39004) +
                      local_tid_35493, sizze_31215) && slt32(local_tid_35493,
                                                             squot32(group_sizze_35498,
                                                                     segment_sizze_nonzzero_39004))) {
                *(__global int32_t *) &mem_38326[((group_id_35494 + i_39007 *
                                                   num_groups_35508) *
                                                  squot32(group_sizze_35498,
                                                          segment_sizze_nonzzero_39004) +
                                                  local_tid_35493) * 4] =
                    *(__local int32_t *) &red_arr_mem_39005[((local_tid_35493 +
                                                              1) *
                                                             segment_sizze_nonzzero_39004 -
                                                             1) * 4];
            }
        }
        // Keep the local array stable until every work-item has finished
        // reading it, before the next iteration overwrites it.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
__kernel void segred_small_35723(int32_t sizze_31214, int32_t sizze_31215,
                                 int32_t res_31594, int32_t num_groups_35740,
                                 __global unsigned char *res_mem_38290, __global
                                 unsigned char *res_mem_38339, __global
                                 unsigned char *res_mem_38340, __global
                                 unsigned char *mem_38356,
                                 int32_t segment_sizze_nonzzero_39108)
{
    const int32_t group_sizze_35730 = mainzigroup_sizze_35705;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39109_backing_0, 4 *
                         mainzigroup_sizze_35705);
    
    int32_t global_tid_35723;
    int32_t local_tid_35724;
    int32_t group_sizze_39107;
    int32_t wave_sizze_39106;
    int32_t group_id_35725;
    
    global_tid_35723 = get_global_id(0);
    local_tid_35724 = get_local_id(0);
    group_sizze_39107 = get_local_size(0);
    wave_sizze_39106 = LOCKSTEP_WIDTH;
    group_id_35725 = get_group_id(0);
    
    int32_t gtid_35700;
    int32_t gtid_35722;
    __local char *red_arr_mem_39109;
    
    red_arr_mem_39109 = (__local char *) red_arr_mem_39109_backing_0;
    for (int32_t i_39111 = 0; i_39111 < squot32(squot32(sizze_31215 +
                                                        squot32(group_sizze_35730,
                                                                segment_sizze_nonzzero_39108) -
                                                        1,
                                                        squot32(group_sizze_35730,
                                                                segment_sizze_nonzzero_39108)) -
                                                group_id_35725 +
                                                num_groups_35740 - 1,
                                                num_groups_35740); i_39111++) {
        gtid_35700 = squot32(local_tid_35724, segment_sizze_nonzzero_39108) +
            (group_id_35725 + i_39111 * num_groups_35740) *
            squot32(group_sizze_35730, segment_sizze_nonzzero_39108);
        gtid_35722 = srem32(local_tid_35724, res_31594);
        // apply map function if in bounds
        {
            if (slt32(0, res_31594) && (slt32(gtid_35700, sizze_31215) &&
                                        slt32(local_tid_35724, res_31594 *
                                              squot32(group_sizze_35730,
                                                      segment_sizze_nonzzero_39108)))) {
                int32_t x_35750;
                int32_t x_35751;
                bool cond_35753;
                float res_35754;
                
                x_35750 = *(__global int32_t *) &res_mem_38340[gtid_35700 * 4];
                x_35751 = *(__global int32_t *) &res_mem_38339[gtid_35700 * 4];
                cond_35753 = slt32(gtid_35722, x_35751);
                if (cond_35753) {
                    int32_t x_35755;
                    int32_t x_35756;
                    int32_t i_35757;
                    float res_35758;
                    
                    x_35755 = gtid_35722 + x_35750;
                    x_35756 = x_35755 - x_35751;
                    i_35757 = 1 + x_35756;
                    res_35758 = *(__global float *) &res_mem_38290[(gtid_35700 *
                                                                    sizze_31214 +
                                                                    i_35757) *
                                                                   4];
                    res_35754 = res_35758;
                } else {
                    res_35754 = 0.0F;
                }
                // save results to be reduced
                {
                    *(__local float *) &red_arr_mem_39109[local_tid_35724 * 4] =
                        res_35754;
                }
                // save map-out results
                { }
            } else {
                *(__local float *) &red_arr_mem_39109[local_tid_35724 * 4] =
                    0.0F;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (slt32(0, res_31594)) {
            // perform segmented scan to imitate reduction
            {
                float x_35746;
                float x_35747;
                float x_39112;
                float x_39113;
                int32_t skip_threads_39115;
                
                if (slt32(local_tid_35724, res_31594 *
                          squot32(group_sizze_35730,
                                  segment_sizze_nonzzero_39108))) {
                    x_35747 = *(volatile __local
                                float *) &red_arr_mem_39109[local_tid_35724 *
                                                            sizeof(float)];
                }
                // in-block scan (hopefully no barriers needed)
                {
                    skip_threads_39115 = 1;
                    while (slt32(skip_threads_39115, 32)) {
                        if (sle32(skip_threads_39115, local_tid_35724 -
                                  squot32(local_tid_35724, 32) * 32) &&
                            slt32(local_tid_35724, res_31594 *
                                  squot32(group_sizze_35730,
                                          segment_sizze_nonzzero_39108))) {
                            // read operands
                            {
                                x_35746 = *(volatile __local
                                            float *) &red_arr_mem_39109[(local_tid_35724 -
                                                                         skip_threads_39115) *
                                                                        sizeof(float)];
                            }
                            // perform operation
                            {
                                if (!slt32(srem32(local_tid_35724, res_31594),
                                           local_tid_35724 - (local_tid_35724 -
                                                              skip_threads_39115))) {
                                    float res_35748 = x_35746 + x_35747;
                                    
                                    x_35747 = res_35748;
                                }
                            }
                        }
                        if (sle32(wave_sizze_39106, skip_threads_39115)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        if (sle32(skip_threads_39115, local_tid_35724 -
                                  squot32(local_tid_35724, 32) * 32) &&
                            slt32(local_tid_35724, res_31594 *
                                  squot32(group_sizze_35730,
                                          segment_sizze_nonzzero_39108))) {
                            // write result
                            {
                                *(volatile __local
                                  float *) &red_arr_mem_39109[local_tid_35724 *
                                                              sizeof(float)] =
                                    x_35747;
                            }
                        }
                        if (sle32(wave_sizze_39106, skip_threads_39115)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        skip_threads_39115 *= 2;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // last thread of block 'i' writes its result to offset 'i'
                {
                    if ((local_tid_35724 - squot32(local_tid_35724, 32) * 32) ==
                        31 && slt32(local_tid_35724, res_31594 *
                                    squot32(group_sizze_35730,
                                            segment_sizze_nonzzero_39108))) {
                        *(volatile __local
                          float *) &red_arr_mem_39109[squot32(local_tid_35724,
                                                              32) *
                                                      sizeof(float)] = x_35747;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
                {
                    int32_t skip_threads_39116;
                    
                    if (squot32(local_tid_35724, 32) == 0 &&
                        slt32(local_tid_35724, res_31594 *
                              squot32(group_sizze_35730,
                                      segment_sizze_nonzzero_39108))) {
                        x_39113 = *(volatile __local
                                    float *) &red_arr_mem_39109[local_tid_35724 *
                                                                sizeof(float)];
                    }
                    // in-block scan (hopefully no barriers needed)
                    {
                        skip_threads_39116 = 1;
                        while (slt32(skip_threads_39116, 32)) {
                            if (sle32(skip_threads_39116, local_tid_35724 -
                                      squot32(local_tid_35724, 32) * 32) &&
                                (squot32(local_tid_35724, 32) == 0 &&
                                 slt32(local_tid_35724, res_31594 *
                                       squot32(group_sizze_35730,
                                               segment_sizze_nonzzero_39108)))) {
                                // read operands
                                {
                                    x_39112 = *(volatile __local
                                                float *) &red_arr_mem_39109[(local_tid_35724 -
                                                                             skip_threads_39116) *
                                                                            sizeof(float)];
                                }
                                // perform operation
                                {
                                    if (!slt32(srem32(local_tid_35724 * 32 +
                                                      32 - 1, res_31594),
                                               local_tid_35724 * 32 + 32 - 1 -
                                               ((local_tid_35724 -
                                                 skip_threads_39116) * 32 + 32 -
                                                1))) {
                                        float res_39114 = x_39112 + x_39113;
                                        
                                        x_39113 = res_39114;
                                    }
                                }
                            }
                            if (sle32(wave_sizze_39106, skip_threads_39116)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            if (sle32(skip_threads_39116, local_tid_35724 -
                                      squot32(local_tid_35724, 32) * 32) &&
                                (squot32(local_tid_35724, 32) == 0 &&
                                 slt32(local_tid_35724, res_31594 *
                                       squot32(group_sizze_35730,
                                               segment_sizze_nonzzero_39108)))) {
                                // write result
                                {
                                    *(volatile __local
                                      float *) &red_arr_mem_39109[local_tid_35724 *
                                                                  sizeof(float)] =
                                        x_39113;
                                }
                            }
                            if (sle32(wave_sizze_39106, skip_threads_39116)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            skip_threads_39116 *= 2;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // carry-in for every block except the first
                {
                    if (!(squot32(local_tid_35724, 32) == 0 ||
                          !slt32(local_tid_35724, res_31594 *
                                 squot32(group_sizze_35730,
                                         segment_sizze_nonzzero_39108)))) {
                        // read operands
                        {
                            x_35746 = *(volatile __local
                                        float *) &red_arr_mem_39109[(squot32(local_tid_35724,
                                                                             32) -
                                                                     1) *
                                                                    sizeof(float)];
                        }
                        // perform operation
                        {
                            if (!slt32(srem32(local_tid_35724, res_31594),
                                       local_tid_35724 -
                                       (squot32(local_tid_35724, 32) * 32 -
                                        1))) {
                                float res_35748 = x_35746 + x_35747;
                                
                                x_35747 = res_35748;
                            }
                        }
                        // write final result
                        {
                            *(volatile __local
                              float *) &red_arr_mem_39109[local_tid_35724 *
                                                          sizeof(float)] =
                                x_35747;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // restore correct values for first block
                {
                    if (squot32(local_tid_35724, 32) == 0) {
                        *(volatile __local
                          float *) &red_arr_mem_39109[local_tid_35724 *
                                                      sizeof(float)] = x_35747;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // save final values of segments
        {
            if (slt32((group_id_35725 + i_39111 * num_groups_35740) *
                      squot32(group_sizze_35730, segment_sizze_nonzzero_39108) +
                      local_tid_35724, sizze_31215) && slt32(local_tid_35724,
                                                             squot32(group_sizze_35730,
                                                                     segment_sizze_nonzzero_39108))) {
                *(__global float *) &mem_38356[((group_id_35725 + i_39111 *
                                                 num_groups_35740) *
                                                squot32(group_sizze_35730,
                                                        segment_sizze_nonzzero_39108) +
                                                local_tid_35724) * 4] =
                    *(__local float *) &red_arr_mem_39109[((local_tid_35724 +
                                                            1) *
                                                           segment_sizze_nonzzero_39108 -
                                                           1) * 4];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
__kernel void segred_small_36423(int32_t sizze_31215, int32_t arg_31616,
                                 int32_t num_groups_36592, __global
                                 unsigned char *mem_38361, __global
                                 unsigned char *mem_38434, __global
                                 unsigned char *mem_38437, __global
                                 unsigned char *mem_38441, __global
                                 unsigned char *mem_38443, __global
                                 unsigned char *mem_38446, __global
                                 unsigned char *mem_38449, __global
                                 unsigned char *mem_38453,
                                 int32_t segment_sizze_nonzzero_39225)
{
    const int32_t group_sizze_36582 = mainzigroup_sizze_36405;
    const int block_dim0 = 0;
    const int block_dim1 = 1;
    const int block_dim2 = 2;
    
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39226_backing_0, mainzigroup_sizze_36405);
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39228_backing_1, 4 *
                         mainzigroup_sizze_36405);
    ALIGNED_LOCAL_MEMORY(red_arr_mem_39230_backing_2, 4 *
                         mainzigroup_sizze_36405);
    
    int32_t global_tid_36423;
    int32_t local_tid_36424;
    int32_t group_sizze_39224;
    int32_t wave_sizze_39223;
    int32_t group_id_36425;
    
    global_tid_36423 = get_global_id(0);
    local_tid_36424 = get_local_id(0);
    group_sizze_39224 = get_local_size(0);
    wave_sizze_39223 = LOCKSTEP_WIDTH;
    group_id_36425 = get_group_id(0);
    
    int32_t gtid_36399;
    int32_t gtid_36422;
    __local char *red_arr_mem_39226;
    
    red_arr_mem_39226 = (__local char *) red_arr_mem_39226_backing_0;
    
    __local char *red_arr_mem_39228;
    
    red_arr_mem_39228 = (__local char *) red_arr_mem_39228_backing_1;
    
    __local char *red_arr_mem_39230;
    
    red_arr_mem_39230 = (__local char *) red_arr_mem_39230_backing_2;
    for (int32_t i_39232 = 0; i_39232 < squot32(squot32(sizze_31215 +
                                                        squot32(group_sizze_36582,
                                                                segment_sizze_nonzzero_39225) -
                                                        1,
                                                        squot32(group_sizze_36582,
                                                                segment_sizze_nonzzero_39225)) -
                                                group_id_36425 +
                                                num_groups_36592 - 1,
                                                num_groups_36592); i_39232++) {
        gtid_36399 = squot32(local_tid_36424, segment_sizze_nonzzero_39225) +
            (group_id_36425 + i_39232 * num_groups_36592) *
            squot32(group_sizze_36582, segment_sizze_nonzzero_39225);
        gtid_36422 = srem32(local_tid_36424, arg_31616);
        // apply map function if in bounds
        {
            if (slt32(0, arg_31616) && (slt32(gtid_36399, sizze_31215) &&
                                        slt32(local_tid_36424, arg_31616 *
                                              squot32(group_sizze_36582,
                                                      segment_sizze_nonzzero_39225)))) {
                int32_t y_36614;
                float y_36615;
                float x_36619;
                float x_36620;
                float res_36623;
                bool cond_36624;
                bool res_36625;
                bool res_36626;
                bool x_36627;
                float res_36628;
                bool res_36629;
                bool x_36630;
                float res_36631;
                
                y_36614 = *(__global int32_t *) &mem_38437[gtid_36399 * 4];
                y_36615 = *(__global float *) &mem_38434[gtid_36399 * 4];
                x_36619 = *(__global float *) &mem_38441[(gtid_36399 *
                                                          arg_31616 +
                                                          gtid_36422) * 4];
                x_36620 = *(__global float *) &mem_38361[gtid_36422 * 4];
                res_36623 = x_36619 / y_36615;
                cond_36624 = slt32(gtid_36422, y_36614);
                res_36625 = futrts_isnan32(res_36623);
                res_36626 = !res_36625;
                x_36627 = cond_36624 && res_36626;
                res_36628 = (float) fabs(res_36623);
                res_36629 = x_36620 < res_36628;
                x_36630 = x_36627 && res_36629;
                if (cond_36624) {
                    res_36631 = res_36623;
                } else {
                    res_36631 = 0.0F;
                }
                // save results to be reduced
                {
                    *(__local bool *) &red_arr_mem_39226[local_tid_36424] =
                        x_36630;
                    *(__local int32_t *) &red_arr_mem_39228[local_tid_36424 *
                                                            4] = gtid_36422;
                    *(__local float *) &red_arr_mem_39230[local_tid_36424 * 4] =
                        res_36631;
                }
                // save map-out results
                {
                    if (1) {
                        *(__global float *) &mem_38453[(gtid_36399 * arg_31616 +
                                                        gtid_36422) * 4] =
                            res_36623;
                    }
                }
            } else {
                *(__local bool *) &red_arr_mem_39226[local_tid_36424] = 0;
                *(__local int32_t *) &red_arr_mem_39228[local_tid_36424 * 4] =
                    -1;
                *(__local float *) &red_arr_mem_39230[local_tid_36424 * 4] =
                    0.0F;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (slt32(0, arg_31616)) {
            // perform segmented scan to imitate reduction
            {
                bool x_36601;
                int32_t x_36602;
                float x_36603;
                bool x_36604;
                int32_t x_36605;
                float x_36606;
                bool x_39233;
                int32_t x_39234;
                float x_39235;
                bool x_39236;
                int32_t x_39237;
                float x_39238;
                int32_t skip_threads_39246;
                
                if (slt32(local_tid_36424, arg_31616 *
                          squot32(group_sizze_36582,
                                  segment_sizze_nonzzero_39225))) {
                    x_36604 = *(volatile __local
                                bool *) &red_arr_mem_39226[local_tid_36424 *
                                                           sizeof(bool)];
                    x_36605 = *(volatile __local
                                int32_t *) &red_arr_mem_39228[local_tid_36424 *
                                                              sizeof(int32_t)];
                    x_36606 = *(volatile __local
                                float *) &red_arr_mem_39230[local_tid_36424 *
                                                            sizeof(float)];
                }
                // in-block scan (hopefully no barriers needed)
                {
                    skip_threads_39246 = 1;
                    while (slt32(skip_threads_39246, 32)) {
                        if (sle32(skip_threads_39246, local_tid_36424 -
                                  squot32(local_tid_36424, 32) * 32) &&
                            slt32(local_tid_36424, arg_31616 *
                                  squot32(group_sizze_36582,
                                          segment_sizze_nonzzero_39225))) {
                            // read operands
                            {
                                x_36601 = *(volatile __local
                                            bool *) &red_arr_mem_39226[(local_tid_36424 -
                                                                        skip_threads_39246) *
                                                                       sizeof(bool)];
                                x_36602 = *(volatile __local
                                            int32_t *) &red_arr_mem_39228[(local_tid_36424 -
                                                                           skip_threads_39246) *
                                                                          sizeof(int32_t)];
                                x_36603 = *(volatile __local
                                            float *) &red_arr_mem_39230[(local_tid_36424 -
                                                                         skip_threads_39246) *
                                                                        sizeof(float)];
                            }
                            // perform operation
                            {
                                if (!slt32(srem32(local_tid_36424, arg_31616),
                                           local_tid_36424 - (local_tid_36424 -
                                                              skip_threads_39246))) {
                                    bool res_36607;
                                    int32_t res_36608;
                                    float res_36613;
                                    
                                    if (x_36601) {
                                        res_36607 = x_36601;
                                        res_36608 = x_36602;
                                    } else {
                                        bool x_36609;
                                        bool y_36610;
                                        bool res_36611;
                                        int32_t res_36612;
                                        
                                        x_36609 = !x_36604;
                                        y_36610 = x_36601 && x_36609;
                                        res_36611 = x_36604 || y_36610;
                                        if (x_36604) {
                                            res_36612 = x_36605;
                                        } else {
                                            res_36612 = x_36602;
                                        }
                                        res_36607 = res_36611;
                                        res_36608 = res_36612;
                                    }
                                    res_36613 = x_36603 + x_36606;
                                    x_36604 = res_36607;
                                    x_36605 = res_36608;
                                    x_36606 = res_36613;
                                }
                            }
                        }
                        if (sle32(wave_sizze_39223, skip_threads_39246)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        if (sle32(skip_threads_39246, local_tid_36424 -
                                  squot32(local_tid_36424, 32) * 32) &&
                            slt32(local_tid_36424, arg_31616 *
                                  squot32(group_sizze_36582,
                                          segment_sizze_nonzzero_39225))) {
                            // write result
                            {
                                *(volatile __local
                                  bool *) &red_arr_mem_39226[local_tid_36424 *
                                                             sizeof(bool)] =
                                    x_36604;
                                *(volatile __local
                                  int32_t *) &red_arr_mem_39228[local_tid_36424 *
                                                                sizeof(int32_t)] =
                                    x_36605;
                                *(volatile __local
                                  float *) &red_arr_mem_39230[local_tid_36424 *
                                                              sizeof(float)] =
                                    x_36606;
                            }
                        }
                        if (sle32(wave_sizze_39223, skip_threads_39246)) {
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }
                        skip_threads_39246 *= 2;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // last thread of block 'i' writes its result to offset 'i'
                {
                    if ((local_tid_36424 - squot32(local_tid_36424, 32) * 32) ==
                        31 && slt32(local_tid_36424, arg_31616 *
                                    squot32(group_sizze_36582,
                                            segment_sizze_nonzzero_39225))) {
                        *(volatile __local
                          bool *) &red_arr_mem_39226[squot32(local_tid_36424,
                                                             32) *
                                                     sizeof(bool)] = x_36604;
                        *(volatile __local
                          int32_t *) &red_arr_mem_39228[squot32(local_tid_36424,
                                                                32) *
                                                        sizeof(int32_t)] =
                            x_36605;
                        *(volatile __local
                          float *) &red_arr_mem_39230[squot32(local_tid_36424,
                                                              32) *
                                                      sizeof(float)] = x_36606;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // scan the first block, after which offset 'i' contains carry-in for warp 'i+1'
                {
                    int32_t skip_threads_39247;
                    
                    if (squot32(local_tid_36424, 32) == 0 &&
                        slt32(local_tid_36424, arg_31616 *
                              squot32(group_sizze_36582,
                                      segment_sizze_nonzzero_39225))) {
                        x_39236 = *(volatile __local
                                    bool *) &red_arr_mem_39226[local_tid_36424 *
                                                               sizeof(bool)];
                        x_39237 = *(volatile __local
                                    int32_t *) &red_arr_mem_39228[local_tid_36424 *
                                                                  sizeof(int32_t)];
                        x_39238 = *(volatile __local
                                    float *) &red_arr_mem_39230[local_tid_36424 *
                                                                sizeof(float)];
                    }
                    // in-block scan (hopefully no barriers needed)
                    {
                        skip_threads_39247 = 1;
                        while (slt32(skip_threads_39247, 32)) {
                            if (sle32(skip_threads_39247, local_tid_36424 -
                                      squot32(local_tid_36424, 32) * 32) &&
                                (squot32(local_tid_36424, 32) == 0 &&
                                 slt32(local_tid_36424, arg_31616 *
                                       squot32(group_sizze_36582,
                                               segment_sizze_nonzzero_39225)))) {
                                // read operands
                                {
                                    x_39233 = *(volatile __local
                                                bool *) &red_arr_mem_39226[(local_tid_36424 -
                                                                            skip_threads_39247) *
                                                                           sizeof(bool)];
                                    x_39234 = *(volatile __local
                                                int32_t *) &red_arr_mem_39228[(local_tid_36424 -
                                                                               skip_threads_39247) *
                                                                              sizeof(int32_t)];
                                    x_39235 = *(volatile __local
                                                float *) &red_arr_mem_39230[(local_tid_36424 -
                                                                             skip_threads_39247) *
                                                                            sizeof(float)];
                                }
                                // perform operation
                                {
                                    if (!slt32(srem32(local_tid_36424 * 32 +
                                                      32 - 1, arg_31616),
                                               local_tid_36424 * 32 + 32 - 1 -
                                               ((local_tid_36424 -
                                                 skip_threads_39247) * 32 + 32 -
                                                1))) {
                                        bool res_39239;
                                        int32_t res_39240;
                                        float res_39245;
                                        
                                        if (x_39233) {
                                            res_39239 = x_39233;
                                            res_39240 = x_39234;
                                        } else {
                                            bool x_39241;
                                            bool y_39242;
                                            bool res_39243;
                                            int32_t res_39244;
                                            
                                            x_39241 = !x_39236;
                                            y_39242 = x_39233 && x_39241;
                                            res_39243 = x_39236 || y_39242;
                                            if (x_39236) {
                                                res_39244 = x_39237;
                                            } else {
                                                res_39244 = x_39234;
                                            }
                                            res_39239 = res_39243;
                                            res_39240 = res_39244;
                                        }
                                        res_39245 = x_39235 + x_39238;
                                        x_39236 = res_39239;
                                        x_39237 = res_39240;
                                        x_39238 = res_39245;
                                    }
                                }
                            }
                            if (sle32(wave_sizze_39223, skip_threads_39247)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            if (sle32(skip_threads_39247, local_tid_36424 -
                                      squot32(local_tid_36424, 32) * 32) &&
                                (squot32(local_tid_36424, 32) == 0 &&
                                 slt32(local_tid_36424, arg_31616 *
                                       squot32(group_sizze_36582,
                                               segment_sizze_nonzzero_39225)))) {
                                // write result
                                {
                                    *(volatile __local
                                      bool *) &red_arr_mem_39226[local_tid_36424 *
                                                                 sizeof(bool)] =
                                        x_39236;
                                    *(volatile __local
                                      int32_t *) &red_arr_mem_39228[local_tid_36424 *
                                                                    sizeof(int32_t)] =
                                        x_39237;
                                    *(volatile __local
                                      float *) &red_arr_mem_39230[local_tid_36424 *
                                                                  sizeof(float)] =
                                        x_39238;
                                }
                            }
                            if (sle32(wave_sizze_39223, skip_threads_39247)) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                            }
                            skip_threads_39247 *= 2;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // carry-in for every block except the first
                {
                    if (!(squot32(local_tid_36424, 32) == 0 ||
                          !slt32(local_tid_36424, arg_31616 *
                                 squot32(group_sizze_36582,
                                         segment_sizze_nonzzero_39225)))) {
                        // read operands
                        {
                            x_36601 = *(volatile __local
                                        bool *) &red_arr_mem_39226[(squot32(local_tid_36424,
                                                                            32) -
                                                                    1) *
                                                                   sizeof(bool)];
                            x_36602 = *(volatile __local
                                        int32_t *) &red_arr_mem_39228[(squot32(local_tid_36424,
                                                                               32) -
                                                                       1) *
                                                                      sizeof(int32_t)];
                            x_36603 = *(volatile __local
                                        float *) &red_arr_mem_39230[(squot32(local_tid_36424,
                                                                             32) -
                                                                     1) *
                                                                    sizeof(float)];
                        }
                        // perform operation
                        {
                            if (!slt32(srem32(local_tid_36424, arg_31616),
                                       local_tid_36424 -
                                       (squot32(local_tid_36424, 32) * 32 -
                                        1))) {
                                bool res_36607;
                                int32_t res_36608;
                                float res_36613;
                                
                                if (x_36601) {
                                    res_36607 = x_36601;
                                    res_36608 = x_36602;
                                } else {
                                    bool x_36609;
                                    bool y_36610;
                                    bool res_36611;
                                    int32_t res_36612;
                                    
                                    x_36609 = !x_36604;
                                    y_36610 = x_36601 && x_36609;
                                    res_36611 = x_36604 || y_36610;
                                    if (x_36604) {
                                        res_36612 = x_36605;
                                    } else {
                                        res_36612 = x_36602;
                                    }
                                    res_36607 = res_36611;
                                    res_36608 = res_36612;
                                }
                                res_36613 = x_36603 + x_36606;
                                x_36604 = res_36607;
                                x_36605 = res_36608;
                                x_36606 = res_36613;
                            }
                        }
                        // write final result
                        {
                            *(volatile __local
                              bool *) &red_arr_mem_39226[local_tid_36424 *
                                                         sizeof(bool)] =
                                x_36604;
                            *(volatile __local
                              int32_t *) &red_arr_mem_39228[local_tid_36424 *
                                                            sizeof(int32_t)] =
                                x_36605;
                            *(volatile __local
                              float *) &red_arr_mem_39230[local_tid_36424 *
                                                          sizeof(float)] =
                                x_36606;
                        }
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
                // restore correct values for first block
                {
                    if (squot32(local_tid_36424, 32) == 0) {
                        *(volatile __local
                          bool *) &red_arr_mem_39226[local_tid_36424 *
                                                     sizeof(bool)] = x_36604;
                        *(volatile __local
                          int32_t *) &red_arr_mem_39228[local_tid_36424 *
                                                        sizeof(int32_t)] =
                            x_36605;
                        *(volatile __local
                          float *) &red_arr_mem_39230[local_tid_36424 *
                                                      sizeof(float)] = x_36606;
                    }
                }
                barrier(CLK_LOCAL_MEM_FENCE);
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // save final values of segments
        {
            if (slt32((group_id_36425 + i_39232 * num_groups_36592) *
                      squot32(group_sizze_36582, segment_sizze_nonzzero_39225) +
                      local_tid_36424, sizze_31215) && slt32(local_tid_36424,
                                                             squot32(group_sizze_36582,
                                                                     segment_sizze_nonzzero_39225))) {
                *(__global bool *) &mem_38443[(group_id_36425 + i_39232 *
                                               num_groups_36592) *
                                              squot32(group_sizze_36582,
                                                      segment_sizze_nonzzero_39225) +
                                              local_tid_36424] = *(__local
                                                                   bool *) &red_arr_mem_39226[(local_tid_36424 +
                                                                                               1) *
                                                                                              segment_sizze_nonzzero_39225 -
                                                                                              1];
                *(__global int32_t *) &mem_38446[((group_id_36425 + i_39232 *
                                                   num_groups_36592) *
                                                  squot32(group_sizze_36582,
                                                          segment_sizze_nonzzero_39225) +
                                                  local_tid_36424) * 4] =
                    *(__local int32_t *) &red_arr_mem_39228[((local_tid_36424 +
                                                              1) *
                                                             segment_sizze_nonzzero_39225 -
                                                             1) * 4];
                *(__global float *) &mem_38449[((group_id_36425 + i_39232 *
                                                 num_groups_36592) *
                                                squot32(group_sizze_36582,
                                                        segment_sizze_nonzzero_39225) +
                                                local_tid_36424) * 4] =
                    *(__local float *) &red_arr_mem_39230[((local_tid_36424 +
                                                            1) *
                                                           segment_sizze_nonzzero_39225 -
                                                           1) * 4];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
"""
# Hacky parser/reader/writer for values written in Futhark syntax.
# Used for reading stdin when compiling standalone programs with the
# Python code generator.

import numpy as np
import string
import struct
import sys

class ReaderInput:
    """Byte reader over a file-like object with unlimited push-back.

    Characters returned to the reader with `unget_char` are kept in
    `lookahead_buffer` and are handed out again, in order, before any
    further data is read from the wrapped file object.
    """

    def __init__(self, f):
        self.f = f
        self.lookahead_buffer = []

    def get_char(self):
        """Return the next character, preferring pushed-back ones."""
        if self.lookahead_buffer:
            return self.lookahead_buffer.pop(0)
        return self.f.read(1)

    def unget_char(self, c):
        """Push `c` back so that it is the next character returned."""
        self.lookahead_buffer.insert(0, c)

    def get_chars(self, n):
        """Return up to `n` characters: pushed-back ones first, then file data."""
        from_buffer = min(n, len(self.lookahead_buffer))
        pending = self.lookahead_buffer
        chunk = b''.join(pending[:from_buffer])
        self.lookahead_buffer = pending[from_buffer:]
        still_needed = n - from_buffer
        if still_needed > 0:
            chunk += self.f.read(still_needed)
        return chunk

    def peek_char(self):
        """Return the next character without consuming it."""
        upcoming = self.get_char()
        if upcoming:
            # Only real characters are pushed back; an empty read is not.
            self.unget_char(upcoming)
        return upcoming

def skip_spaces(f):
    """Consume leading whitespace and `--` line comments from reader `f`.

    On return the next character of `f` is the first one that is neither
    whitespace nor part of a line comment (a lone `-` is left in place).
    An exhausted reader yields an empty (falsy) character, so every loop
    stops on a falsy character rather than only on `None`; otherwise a
    comment on the last line without a trailing newline would never
    terminate.
    """
    c = f.get_char()
    while c:
        if c.isspace():
            c = f.get_char()
        elif c == b'-':
            # May be line comment.
            if f.peek_char() == b'-':
                # Yes, line comment. Skip to end of line (or end of input).
                while c and c != b'\n':
                    c = f.get_char()
            else:
                break
        else:
            break
    if c:
        f.unget_char(c)

def parse_specific_char(f, expected):
    """Consume one character from `f` and require it to equal `expected`.

    Returns True on a match.  Otherwise the character is pushed back and
    ValueError is raised.
    """
    actual = f.get_char()
    if actual == expected:
        return True
    f.unget_char(actual)
    raise ValueError

def parse_specific_string(f, s):
    """Consume exactly the text `s` from `f`, or consume nothing.

    Returns True on success.  If a character does not match, everything
    read so far is pushed back and the ValueError is re-raised.
    """
    # Indexing a `bytes` object yields an int on Python 3, so one-byte
    # slices are used to obtain `bytes` values comparable with what the
    # reader returns.
    encoded = s.encode('utf8')
    consumed = []
    try:
        for k in range(len(encoded)):
            piece = encoded[k:k + 1]
            parse_specific_char(f, piece)
            consumed.append(piece)
    except ValueError:
        # Restore the input in reverse order so it reads as before.
        for piece in reversed(consumed):
            f.unget_char(piece)
        raise
    return True

def optional(p, *args):
    """Run parser `p(*args)`; return its result, or None if it raises ValueError."""
    try:
        result = p(*args)
    except ValueError:
        result = None
    return result

def optional_specific_string(f, s):
    """Consume `s` from `f` if the next character starts it.

    Returns False without consuming anything when the next character
    differs from the first character of `s`; otherwise defers to
    `parse_specific_string` (which raises ValueError on a partial match).
    """
    upcoming = f.peek_char()
    # One-byte slices are needed because indexing `bytes` gives an int on
    # Python 3, while the reader hands out `bytes` values.
    encoded = s.encode('utf8')
    pieces = [encoded[k:k + 1] for k in range(len(encoded))]
    if upcoming != pieces[0]:
        return False
    return parse_specific_string(f, s)

def sepBy(p, sep, *args):
    """Parse zero or more `p` items separated by `sep`; return them as a list.

    Both parsers are called with `*args`.  The first item and every
    separator are optional (a ValueError from them ends the list), but once
    a separator has been consumed the following item is mandatory and its
    ValueError propagates.
    """
    elems = []
    first = optional(p, *args)
    # Identity checks: parsed values may be numpy scalars, for which
    # `!= None` goes through numpy's comparison machinery.
    if first is not None:
        elems.append(first)
        while optional(sep, *args) is not None:
            elems.append(p(*args))
    return elems

# Assumes '0x' has already been read
def parse_hex_int(f):
    """Read hexadecimal digits from `f` and return the value as decimal text.

    Underscores between digits are skipped.  Reading stops at the first
    other character (which is pushed back) or at end of input.  The result
    is a byte string holding the decimal representation; `int()` raises
    ValueError if no digit was read.
    """
    digits = b''
    c = f.get_char()
    # End of input is an empty (falsy) character.  It must be tested
    # explicitly: `b'' in b'...'` is True, so treating it as a digit would
    # loop forever.
    while c:
        if c in b'0123456789ABCDEFabcdef':
            digits += c
        elif c != b'_':
            f.unget_char(c)
            break
        c = f.get_char()
    return str(int(digits, 16)).encode('utf8') # ugh

def parse_int(f):
    """Read an unsigned integer literal from `f` and return its decimal digits.

    A `0x`/`0X` prefix switches to `parse_hex_int`.  Underscores between
    digits are skipped.  Reading stops at the first other character (which
    is pushed back) or at end of input.  Raises ValueError if no digit was
    read.
    """
    c = f.get_char()
    # Compare against explicit one-byte values: at end of input peek_char()
    # is empty, and `b'' in b'xX'` would wrongly report a hex prefix.
    if c == b'0' and f.peek_char() in (b'x', b'X'):
        f.get_char() # skip X
        return parse_hex_int(f)
    digits = b''
    while c:
        if c.isdigit():
            digits += c
        elif c != b'_':
            f.unget_char(c)
            break
        c = f.get_char()
    if not digits:
        raise ValueError
    return digits

def parse_int_signed(f):
    """Read an integer literal with an optional leading `-` or `+`.

    A `-` is kept in the returned text only when a digit follows it; a `+`
    is dropped.  Any other first character is pushed back before the
    digits are parsed with `parse_int`.
    """
    lead = f.get_char()
    if lead == b'-' and f.peek_char().isdigit():
        return lead + parse_int(f)
    if lead != b'+':
        f.unget_char(lead)
    return parse_int(f)

def read_str_comma(f):
    """Skip whitespace/comments, then consume a comma and return it."""
    comma = b','
    skip_spaces(f)
    parse_specific_char(f, comma)
    return b','

def read_str_int(f, s):
    """Read a signed integer literal, then an optional type suffix `s`."""
    skip_spaces(f)
    literal = parse_int_signed(f)
    value = int(literal)
    optional_specific_string(f, s)
    return value

def read_str_uint(f, s):
    """Read an unsigned integer literal, then an optional type suffix `s`."""
    skip_spaces(f)
    literal = parse_int(f)
    value = int(literal)
    optional_specific_string(f, s)
    return value

# Textual readers for the sized integer types.  Each parses a (signed)
# integer literal with an optional type suffix and converts the result to
# the matching numpy scalar type.

def read_str_i8(f):
    value = read_str_int(f, 'i8')
    return np.int8(value)
def read_str_i16(f):
    value = read_str_int(f, 'i16')
    return np.int16(value)
def read_str_i32(f):
    value = read_str_int(f, 'i32')
    return np.int32(value)
def read_str_i64(f):
    value = read_str_int(f, 'i64')
    return np.int64(value)

def read_str_u8(f):
    value = read_str_int(f, 'u8')
    return np.uint8(value)
def read_str_u16(f):
    value = read_str_int(f, 'u16')
    return np.uint16(value)
def read_str_u32(f):
    value = read_str_int(f, 'u32')
    return np.uint32(value)
def read_str_u64(f):
    value = read_str_int(f, 'u64')
    return np.uint64(value)

def read_char(f):
    """Read a single-quoted character literal and return the character."""
    quote = b'\''
    skip_spaces(f)
    parse_specific_char(f, quote)
    ch = f.get_char()
    parse_specific_char(f, quote)
    return ch

def read_str_hex_float(f, sign):
    """Read the body of a hexadecimal float literal and return a float.

    Assumes the leading `0x` has already been consumed.  The expected form
    is `<hex digits>.<hex digits>p<signed decimal exponent>`; underscores
    between hex digits are skipped.  `sign` is b'-' for a negative value.
    Raises ValueError on malformed input.

    The raw hex digits are collected here rather than through
    `parse_hex_int`, because that helper returns *decimal* text: re-reading
    it as base 16 gives the wrong value and loses the number of fraction
    digits.
    """
    def read_hex_digits():
        digits = b''
        c = f.get_char()
        while c:
            if c in b'0123456789ABCDEFabcdef':
                digits += c
            elif c != b'_':
                f.unget_char(c)
                break
            c = f.get_char()
        return digits

    int_part = read_hex_digits()
    parse_specific_char(f, b'.')
    frac_part = read_hex_digits()
    parse_specific_char(f, b'p')
    # The binary exponent may be negative, so it needs the signed parser.
    exponent = parse_int_signed(f)

    literal = b'0x' + int_part + b'.' + frac_part + b'p' + exponent
    total_val = float.fromhex(literal.decode('ascii'))
    if sign == b'-':
        total_val = -total_val

    return total_val


def read_str_decimal(f):
    """Read a floating-point literal from `f` and return it as a float.

    Accepts an optional leading `-`, then either a hexadecimal float
    (`0x...`, handled by `read_str_hex_float`) or a decimal number with
    optional fraction and optional `E`/`e` exponent.  A number without an
    integer part must have a fraction (`.5`).  Raises ValueError on
    malformed input.
    """
    skip_spaces(f)
    c = f.get_char()
    if c == b'-':
        sign = b'-'
    else:
        f.unget_char(c)
        sign = b''

    # Check for hexadecimal float.  The reader yields bytes, so the
    # comparisons must use bytes literals; str literals never match on
    # Python 3.
    c = f.get_char()
    if c == b'0' and f.peek_char() in (b'x', b'X'):
        f.get_char()
        return read_str_hex_float(f, sign)
    f.unget_char(c)

    bef = optional(parse_int, f)
    if bef is None:
        bef = b'0'
        parse_specific_char(f, b'.')
        aft = parse_int(f)
    elif optional(parse_specific_char, f, b'.'):
        aft = parse_int(f)
    else:
        aft = b'0'
    if (optional(parse_specific_char, f, b'E') or
        optional(parse_specific_char, f, b'e')):
        expt = parse_int_signed(f)
    else:
        expt = b'0'
    return float(sign + bef + b'.' + aft + b'E' + expt)

def read_str_f32(f):
    """Read a textual f32 value.

    The special spellings `f32.nan`, `f32.inf` and `-f32.inf` are tried
    first, in that order, and yield np.float32 values.  Anything else is
    parsed as a decimal literal with an optional `f32` suffix, and the
    result of `read_str_decimal` is returned unchanged.
    """
    skip_spaces(f)
    specials = (('f32.nan', np.nan),
                ('f32.inf', np.inf),
                ('-f32.inf', -np.inf))
    for spelling, value in specials:
        try:
            parse_specific_string(f, spelling)
        except ValueError:
            continue
        return np.float32(value)
    number = read_str_decimal(f)
    optional_specific_string(f, 'f32')
    return number

def read_str_f64(f):
    """Read a textual f64 value.

    The special spellings `f64.nan`, `f64.inf` and `-f64.inf` are tried
    first, in that order, and yield np.float64 values.  Anything else is
    parsed as a decimal literal with an optional `f64` suffix, and the
    result of `read_str_decimal` is returned unchanged.
    """
    skip_spaces(f)
    specials = (('f64.nan', np.nan),
                ('f64.inf', np.inf),
                ('-f64.inf', -np.inf))
    for spelling, value in specials:
        try:
            parse_specific_string(f, spelling)
        except ValueError:
            continue
        return np.float64(value)
    number = read_str_decimal(f)
    optional_specific_string(f, 'f64')
    return number

def read_str_bool(f):
    """Read `true` or `false` and return the matching Python bool.

    The first character selects which word must follow.  Raises ValueError
    if it is neither `t` nor `f`, or if the word is incomplete.
    """
    skip_spaces(f)
    words = ((b't', 'true', True), (b'f', 'false', False))
    for first, word, value in words:
        if f.peek_char() == first:
            parse_specific_string(f, word)
            return value
    raise ValueError

def read_str_empty_array(f, type_name, rank):
    """Consume `empty(` + `rank` copies of `[]` + `type_name` + `)`.

    Always returns None, which callers use as the marker for an empty
    array.  Raises ValueError if the input does not have that form.
    """
    parse_specific_string(f, 'empty')
    parse_specific_char(f, b'(')
    remaining = rank
    while remaining > 0:
        parse_specific_string(f, '[]')
        remaining -= 1
    parse_specific_string(f, type_name)
    parse_specific_char(f, b')')

    return None

def read_str_array_elems(f, elem_reader, type_name, rank):
    """Read one bracketed, comma-separated list of elements.

    If the input does not start with `[`, it must be an `empty(...)`
    literal instead, and None is returned.  Otherwise returns the list of
    values produced by `elem_reader`.
    """
    skip_spaces(f)
    try:
        parse_specific_char(f, b'[')
    except ValueError:
        return read_str_empty_array(f, type_name, rank)
    items = sepBy(elem_reader, read_str_comma, f)
    skip_spaces(f)
    parse_specific_char(f, b']')
    return items

def read_str_array_helper(f, elem_reader, type_name, rank):
    """Read a `rank`-dimensional array literal as nested Python lists.

    For rank 1 the elements are read with `elem_reader`; for higher ranks
    each row is itself read recursively with one dimension fewer.
    """
    if rank == 1:
        row_reader = elem_reader
    else:
        # The row reader receives the reader argument from sepBy but uses
        # the enclosing `f` instead.
        row_reader = lambda _: read_str_array_helper(f, elem_reader,
                                                     type_name, rank-1)
    return read_str_array_elems(f, row_reader, type_name, rank-1)

def expected_array_dims(l, rank):
    """Return the shape implied by the first element along each dimension.

    Walks down `l[0]`, `l[0][0]`, ... and records each length; a dimension
    below an empty list is reported as 0.  The result has `rank` entries
    (one entry when `rank` is 1 or less).
    """
    dims = []
    level = l
    depth = rank
    while depth > 1:
        count = len(level)
        dims.append(count)
        level = [] if count == 0 else level[0]
        depth -= 1
    dims.append(len(level))
    return dims

def verify_array_dims(l, dims):
    """Raise ValueError unless nested list `l` has exactly the shape `dims`."""
    outer, inner = dims[0], dims[1:]
    if len(l) != outer:
        raise ValueError
    if inner:
        for row in l:
            verify_array_dims(row, inner)

def read_str_array(f, elem_reader, type_name, rank, bt):
    """Read a textual array literal and return it as a numpy array of dtype `bt`.

    An `empty(...)` literal gives an array with `rank` zero-length
    dimensions.  Otherwise the nested lists must be regular (every row the
    same shape), or ValueError is raised.
    """
    nested = read_str_array_helper(f, elem_reader, type_name, rank)
    if nested is None:
        # Empty array
        return np.empty([0]*rank, dtype=bt)
    shape = expected_array_dims(nested, rank)
    verify_array_dims(nested, shape)
    return np.array(nested, dtype=bt)

################################################################################

# Version byte that read_is_binary() requires in the header of
# binary-format input.
READ_BINARY_VERSION = 2

# struct format specified at
# https://docs.python.org/2/library/struct.html#format-characters

def mk_bin_scalar_reader(t):
    """Return a function that reads one little-endian binary scalar of type `t`.

    The struct format and byte size are looked up in FUTHARK_PRIMTYPES when
    the returned reader is called, not when it is created.
    """
    def bin_reader(f):
        code = FUTHARK_PRIMTYPES[t]['bin_format']
        width = FUTHARK_PRIMTYPES[t]['size']
        raw = f.get_chars(width)
        (value,) = struct.unpack('<' + code, raw)
        return value
    return bin_reader

# One binary scalar reader per primitive type.  Each reader looks up its
# format in FUTHARK_PRIMTYPES when called, so these can be created before
# that table is defined.
read_bin_i8 = mk_bin_scalar_reader('i8')
read_bin_i16 = mk_bin_scalar_reader('i16')
read_bin_i32 = mk_bin_scalar_reader('i32')
read_bin_i64 = mk_bin_scalar_reader('i64')

read_bin_u8 = mk_bin_scalar_reader('u8')
read_bin_u16 = mk_bin_scalar_reader('u16')
read_bin_u32 = mk_bin_scalar_reader('u32')
read_bin_u64 = mk_bin_scalar_reader('u64')

read_bin_f32 = mk_bin_scalar_reader('f32')
read_bin_f64 = mk_bin_scalar_reader('f64')

read_bin_bool = mk_bin_scalar_reader('bool')

def read_is_binary(f):
    """Report whether the next value in `f` is in binary format.

    Binary values start with the byte `b` followed by a version byte; both
    are consumed when present.  An unsupported version aborts via panic().
    For textual input the peeked character is pushed back and False is
    returned.
    """
    skip_spaces(f)
    marker = f.get_char()
    if marker != b'b':
        f.unget_char(marker)
        return False
    bin_version = read_bin_u8(f)
    if bin_version != READ_BINARY_VERSION:
        panic(1, "binary-input: File uses version %i, but I only understand version %i.\n",
              bin_version, READ_BINARY_VERSION)
    return True

# Table of primitive types, keyed by Futhark type name.  Per entry:
#   binname    - 4-byte type tag used in the binary format
#   size       - size of one element in bytes
#   bin_reader - reader for one binary scalar
#   str_reader - reader for one textual scalar
#   bin_format - `struct` format character for one element
#   numpy_type - corresponding numpy scalar type
FUTHARK_PRIMTYPES = {
    'i8':  {'binname' : b"  i8",
            'size' : 1,
            'bin_reader': read_bin_i8,
            'str_reader': read_str_i8,
            'bin_format': 'b',
            'numpy_type': np.int8 },

    'i16': {'binname' : b" i16",
            'size' : 2,
            'bin_reader': read_bin_i16,
            'str_reader': read_str_i16,
            'bin_format': 'h',
            'numpy_type': np.int16 },

    'i32': {'binname' : b" i32",
            'size' : 4,
            'bin_reader': read_bin_i32,
            'str_reader': read_str_i32,
            'bin_format': 'i',
            'numpy_type': np.int32 },

    'i64': {'binname' : b" i64",
            'size' : 8,
            'bin_reader': read_bin_i64,
            'str_reader': read_str_i64,
            'bin_format': 'q',
            'numpy_type': np.int64},

    'u8':  {'binname' : b"  u8",
            'size' : 1,
            'bin_reader': read_bin_u8,
            'str_reader': read_str_u8,
            'bin_format': 'B',
            'numpy_type': np.uint8 },

    'u16': {'binname' : b" u16",
            'size' : 2,
            'bin_reader': read_bin_u16,
            'str_reader': read_str_u16,
            'bin_format': 'H',
            'numpy_type': np.uint16 },

    'u32': {'binname' : b" u32",
            'size' : 4,
            'bin_reader': read_bin_u32,
            'str_reader': read_str_u32,
            'bin_format': 'I',
            'numpy_type': np.uint32 },

    'u64': {'binname' : b" u64",
            'size' : 8,
            'bin_reader': read_bin_u64,
            'str_reader': read_str_u64,
            'bin_format': 'Q',
            'numpy_type': np.uint64 },

    'f32': {'binname' : b" f32",
            'size' : 4,
            'bin_reader': read_bin_f32,
            'str_reader': read_str_f32,
            'bin_format': 'f',
            'numpy_type': np.float32 },

    'f64': {'binname' : b" f64",
            'size' : 8,
            'bin_reader': read_bin_f64,
            'str_reader': read_str_f64,
            'bin_format': 'd',
            'numpy_type': np.float64 },

    # np.bool_ rather than the `np.bool` alias: the alias was removed in
    # NumPy 1.24, which made this module fail at import time there.
    'bool': {'binname' : b"bool",
             'size' : 1,
             'bin_reader': read_bin_bool,
             'str_reader': read_str_bool,
             'bin_format': 'b',
             'numpy_type': np.bool_ }
}

def read_bin_read_type(f):
    """Read the 4-byte binary type tag and return the matching type name.

    Aborts via panic() if the tag matches no entry of FUTHARK_PRIMTYPES.
    """
    read_binname = f.get_chars(4)

    for type_name in FUTHARK_PRIMTYPES:
        if FUTHARK_PRIMTYPES[type_name]['binname'] == read_binname:
            return type_name
    panic(1, "binary-input: Did not recognize the type '%s'.\n", read_binname)

def numpy_type_to_type_name(t):
    """Return the Futhark type name whose numpy type equals `t`.

    Raises Exception if no entry of FUTHARK_PRIMTYPES matches.
    """
    for type_name, info in FUTHARK_PRIMTYPES.items():
        candidate = info['numpy_type']
        if candidate == t:
            return type_name
    raise Exception('Unknown Numpy type: {}'.format(t))

def read_bin_ensure_scalar(f, expected_type):
    """Consume a binary value header and require a scalar of `expected_type`.

    Reads the dimension count and the type tag; aborts via panic() if the
    value is an array or has a different element type.
    """
    dims = read_bin_i8(f)

    if dims != 0:
        panic(1, "binary-input: Expected scalar (0 dimensions), but got array with %i dimensions.\n", dims)

    bin_type = read_bin_read_type(f)
    if bin_type == expected_type:
        return
    panic(1, "binary-input: Expected scalar of type %s but got scalar of type %s.\n",
          expected_type, bin_type)

# ------------------------------------------------------------------------------
# General interface for reading Primitive Futhark Values
# ------------------------------------------------------------------------------

def read_scalar(f, ty):
    """Read one scalar of primitive type `ty`, in binary or textual form."""
    if not read_is_binary(f):
        return FUTHARK_PRIMTYPES[ty]['str_reader'](f)
    read_bin_ensure_scalar(f, ty)
    return FUTHARK_PRIMTYPES[ty]['bin_reader'](f)

def read_array(f, expected_type, rank):
    """Read an array of `rank` dimensions with element type `expected_type`.

    Textual input is handled by `read_str_array`.  Binary input consists of
    a rank byte, a 4-byte type tag, one 64-bit size per dimension and then
    the little-endian element data.  A mismatch in rank or element type
    aborts via panic().  Returns a numpy array.
    """
    if not read_is_binary(f):
        str_reader = FUTHARK_PRIMTYPES[expected_type]['str_reader']
        return read_str_array(f, str_reader, expected_type, rank,
                              FUTHARK_PRIMTYPES[expected_type]['numpy_type'])

    bin_rank = read_bin_u8(f)

    if bin_rank != rank:
        panic(1, "binary-input: Expected %i dimensions, but got array with %i dimensions.\n",
              rank, bin_rank)

    bin_type_enum = read_bin_read_type(f)
    if expected_type != bin_type_enum:
        panic(1, "binary-input: Expected %iD-array with element type '%s' but got %iD-array with element type '%s'.\n",
              rank, expected_type, bin_rank, bin_type_enum)

    shape = []
    elem_count = 1
    for _ in range(rank):
        bin_size = read_bin_u64(f)
        elem_count *= bin_size
        shape.append(bin_size)

    bin_fmt = FUTHARK_PRIMTYPES[bin_type_enum]['bin_format']

    # We first read the expected number of bytes into a bytestring, then
    # decode it in memory.  This is because np.fromfile does not work on
    # things that are insufficiently file-like, like a network stream.
    raw = f.get_chars(elem_count * FUTHARK_PRIMTYPES[expected_type]['size'])
    # np.frombuffer replaces the deprecated binary mode of np.fromstring.
    # It returns a read-only view of `raw`, so copy to get a writable array
    # as before.
    arr = np.frombuffer(raw, dtype='<'+bin_fmt).copy()
    arr.shape = shape

    return arr

# Default reader over standard input.  The parsers compare against bytes
# values, so on Python 3 the underlying binary buffer of stdin is wrapped
# instead of the text stream.
if sys.version_info >= (3,0):
    input_reader = ReaderInput(sys.stdin.buffer)
else:
    input_reader = ReaderInput(sys.stdin)

import re

def read_value(type_desc, reader=input_reader):
    """Read a value of the given type.  The type is a string
representation of the Futhark type.

    `type_desc` is zero or more `[]` pairs followed by a primitive type
    name (for example `[][]f32`).  Arrays are read with `read_array`,
    scalars with `read_scalar`.  Returns None if `type_desc` does not have
    that form; an unknown primitive type name fails the assertion.
    """
    m = re.match(r'((?:\[\])*)([a-z0-9]+)$', type_desc)
    if m is None:
        return None
    # Each dimension contributes the two characters "[]".
    dims = len(m.group(1)) // 2
    basetype = m.group(2)
    assert basetype in FUTHARK_PRIMTYPES, "Unknown type: {}".format(type_desc)
    if dims > 0:
        return read_array(reader, basetype, dims)
    return read_scalar(reader, basetype)

def write_value_text(v, out=sys.stdout):
    """Write `v` to `out` in Futhark's textual value syntax.

    Supports numpy integer, float and boolean scalars (plus Python bool)
    and numpy arrays of those.  Arrays are written element by element;
    arrays with no elements are written as an `empty(...)` literal.
    Raises Exception for any other type.
    """
    if type(v) == np.uint8:
        out.write("%uu8" % v)
    elif type(v) == np.uint16:
        out.write("%uu16" % v)
    elif type(v) == np.uint32:
        out.write("%uu32" % v)
    elif type(v) == np.uint64:
        out.write("%uu64" % v)
    elif type(v) == np.int8:
        out.write("%di8" % v)
    elif type(v) == np.int16:
        out.write("%di16" % v)
    elif type(v) == np.int32:
        out.write("%di32" % v)
    elif type(v) == np.int64:
        out.write("%di64" % v)
    # `np.bool` used to be an alias of the builtin bool and was removed in
    # NumPy 1.24; naming the builtin keeps the same set of accepted types
    # on every NumPy version.
    elif type(v) in (bool, np.bool_):
        if v:
            out.write("true")
        else:
            out.write("false")
    elif type(v) == np.float32:
        if np.isnan(v):
            out.write('f32.nan')
        elif np.isinf(v):
            if v >= 0:
                out.write('f32.inf')
            else:
                out.write('-f32.inf')
        else:
            out.write("%.6ff32" % v)
    elif type(v) == np.float64:
        if np.isnan(v):
            out.write('f64.nan')
        elif np.isinf(v):
            if v >= 0:
                out.write('f64.inf')
            else:
                out.write('-f64.inf')
        else:
            out.write("%.6ff64" % v)
    elif type(v) == np.ndarray:
        # v.size is the product of the shape (np.product was removed in
        # NumPy 2.0).
        if v.size == 0:
            tname = numpy_type_to_type_name(v.dtype)
            out.write('empty({}{})'.format(''.join(['[]' for _ in v.shape[1:]]), tname))
        else:
            first = True
            out.write('[')
            for x in v:
                if not first: out.write(', ')
                first = False
                write_value_text(x, out=out)
            out.write(']')
    else:
        raise Exception("Cannot print value of type {}: {}".format(type(v), v))

# 4-byte type tags of the binary value format, keyed by numpy dtype.
type_strs = { np.dtype('int8'): b'  i8',
              np.dtype('int16'): b' i16',
              np.dtype('int32'): b' i32',
              np.dtype('int64'): b' i64',
              np.dtype('uint8'): b'  u8',
              np.dtype('uint16'): b' u16',
              np.dtype('uint32'): b' u32',
              np.dtype('uint64'): b' u64',
              np.dtype('float32'): b' f32',
              np.dtype('float64'): b' f64',
              np.dtype('bool'): b'bool'}

def construct_binary_value(v):
    """Serialise numpy value `v` in the binary value format.

    Layout: the byte `b`, format version 2, the number of dimensions, a
    4-byte type tag, one 64-bit little-endian size per dimension, then the
    element data in row-major order.  Returns a bytearray.  Raises KeyError
    if the dtype of `v` has no entry in `type_strs`.
    """
    t = v.dtype
    shape = v.shape

    elems = 1
    for d in shape:
        elems *= d

    header_len = 1 + 1 + 1 + 4 + len(shape) * 8
    buf = bytearray(header_len + elems * t.itemsize)
    buf[0] = ord('b')
    buf[1] = 2
    buf[2] = len(shape)
    buf[3:7] = type_strs[t]

    # Sizes and data are written explicitly little-endian, matching the
    # '<' formats used when reading.  tobytes() replaces tostring(), which
    # was removed in NumPy 2.0.
    for i, d in enumerate(shape):
        buf[7+i*8:7+(i+1)*8] = np.array(d, dtype='<i8').tobytes()

    payload = np.ascontiguousarray(v).astype(t.newbyteorder('<'), copy=False)
    buf[header_len:] = payload.tobytes()

    return buf

def write_value_binary(v, out=sys.stdout):
    """Write `v` to `out` in the binary value format.

    On Python 3 the bytes are written to `out.buffer`; on Python 2 they are
    written to `out` directly.
    """
    on_python3 = sys.version_info >= (3,0)
    sink = out.buffer if on_python3 else out
    sink.write(construct_binary_value(v))

def write_value(v, out=sys.stdout, binary=False):
    """Write `v` to `out`, in binary format if `binary` is true, else as text."""
    writer = write_value_binary if binary else write_value_text
    return writer(v, out=out)

################################################################################
### end of values.py
################################################################################
# Helper functions dealing with memory blocks.

import ctypes as ct

def addressOffset(x, offset, bt):
  """Return a pointer to `bt` located `offset` bytes past where `x` points."""
  base = ct.addressof(x.contents)
  target = base + int(offset)
  return ct.cast(target, ct.POINTER(bt))

def allocateMem(size):
  """Allocate a byte buffer of `size` bytes and return a byte pointer to it.

  A negative `size` is treated as zero.
  """
  length = max(0,size)
  block = (ct.c_byte * length)()
  return ct.cast(block, ct.POINTER(ct.c_byte))

# Copy an array if its base is not None, that is, if it is a view of
# another array.  This is important for treating Numpy arrays as flat
# memory, but has some overhead.
def normaliseArray(x):
  """Return `x` itself if it has no separate base array, else a copy of it."""
  has_no_separate_base = (x.base is None) or (x.base is x)
  if has_no_separate_base:
    return x
  return x.copy()

def unwrapArray(x):
  """Return a ctypes byte pointer to the data of numpy array `x`.

  The array is first passed through normaliseArray, so the pointer may
  refer to a copy rather than to `x` itself.
  """
  flat = normaliseArray(x)
  byte_ptr = ct.POINTER(ct.c_byte)
  return flat.ctypes.data_as(byte_ptr)

def createArray(x, dim):
  """Wrap ctypes object `x` as a numpy array of shape `dim` (via np.ctypeslib.as_array)."""
  as_array = np.ctypeslib.as_array
  return as_array(x, shape=dim)

def indexArray(x, offset, bt, nptype):
  """Read the `bt` value `offset` bytes into `x` and convert it with `nptype`."""
  element_ptr = addressOffset(x, offset, bt)
  raw = element_ptr[0]
  return nptype(raw)

def writeScalarArray(x, offset, v):
  """Copy the bytes of ctypes value `v` to `offset` bytes past where `x` points."""
  destination = ct.addressof(x.contents)+int(offset)
  source = ct.addressof(v)
  ct.memmove(destination, source, ct.sizeof(v))

# An opaque Futhark value.
class opaque(object):
  """Container for a Futhark value that is not exposed to Python code.

  `desc` is the type description shown in the repr; `data` is the tuple of
  payload values passed after it.
  """

  def __init__(self, desc, *payload):
    self.desc = desc
    self.data = payload

  def __repr__(self):
    template = "<opaque Futhark value of type {}>"
    return template.format(self.desc)
def panic(exitcode, fmt, *args):
    """Write the program name and `fmt % args` to stderr, then exit with `exitcode`."""
    err = sys.stderr
    err.write('%s: ' % sys.argv[0])
    err.write(fmt % args)
    sys.exit(exitcode)
### start of tuning.py
###
### Reading the .tuning file.

def read_tuning_file(kvs, f):
    """Add the `name=value` lines of file object `f` to dict `kvs` and return it.

    Values are converted with int().  A line that does not contain exactly
    one `=` or whose value is not an integer raises ValueError.
    """
    contents = f.read()
    for entry in contents.splitlines():
        fields = entry.split('=')
        name, raw_value = fields
        kvs[name] = int(raw_value)
    return kvs

### end of tuning.py
# Scalar functions.

import numpy as np
import struct

def signed(x):
  """Convert `x` to the signed numpy type of the same width.

  uint8/uint16/uint32 map to int8/int16/int32; any other type is converted
  to int64.
  """
  same_width = {np.uint8: np.int8,
                np.uint16: np.int16,
                np.uint32: np.int32}
  target = same_width.get(type(x), np.int64)
  return target(x)

def unsigned(x):
  """Convert `x` to the unsigned numpy type of the same width.

  int8/int16/int32 map to uint8/uint16/uint32; any other type is converted
  to uint64.
  """
  same_width = {np.int8: np.uint8,
                np.int16: np.uint16,
                np.int32: np.uint32}
  target = same_width.get(type(x), np.uint64)
  return target(x)

def shlN(x,y):
  """Shift `x` left by `y` bits."""
  shifted = x << y
  return shifted

def ashrN(x,y):
  """Shift `x` right by `y` bits using the `>>` operator of its type."""
  shifted = x >> y
  return shifted

def sdivN(x,y):
  """Floor division of `x` by `y`."""
  quotient = x // y
  return quotient

def smodN(x,y):
  """Remainder of `x` divided by `y`, as given by the `%` operator."""
  remainder = x % y
  return remainder

def udivN(x,y):
  """Divide `x` by `y` as unsigned values; return the result via signed()."""
  ux = unsigned(x)
  uy = unsigned(y)
  return signed(ux // uy)

def umodN(x,y):
  """Remainder of `x` by `y` as unsigned values; result returned via signed()."""
  ux = unsigned(x)
  uy = unsigned(y)
  return signed(ux % uy)

def squotN(x,y):
  """Quotient of `x` by `y` rounded toward zero.

  Computed on the absolute values, then given the product of both signs.
  """
  magnitude = np.floor_divide(np.abs(x), np.abs(y))
  return magnitude * np.sign(x) * np.sign(y)

def sremN(x,y):
  """Remainder of `x` by `y` carrying the sign of `x`.

  Computed on the absolute values, then given the sign of `x`.
  """
  magnitude = np.remainder(np.abs(x), np.abs(y))
  return magnitude * np.sign(x)

def sminN(x,y):
  """Return the smaller of `x` and `y`."""
  smaller = min(x,y)
  return smaller

def smaxN(x,y):
  """Return the larger of `x` and `y`."""
  larger = max(x,y)
  return larger

def uminN(x,y):
  """Smaller of `x` and `y` compared as unsigned values; returned via signed()."""
  ux = unsigned(x)
  uy = unsigned(y)
  return signed(min(ux,uy))

def umaxN(x,y):
  """Larger of `x` and `y` compared as unsigned values; returned via signed()."""
  ux = unsigned(x)
  uy = unsigned(y)
  return signed(max(ux,uy))

def fminN(x,y):
  """Return the smaller of `x` and `y`."""
  smaller = min(x,y)
  return smaller

def fmaxN(x,y):
  """Return the larger of `x` and `y`."""
  larger = max(x,y)
  return larger

def powN(x,y):
  """Return `x` raised to the power `y`."""
  power = x ** y
  return power

def fpowN(x,y):
  """Return `x` raised to the power `y`."""
  power = x ** y
  return power

def sleN(x,y):
  """Return whether `x` is less than or equal to `y`."""
  at_most = x <= y
  return at_most

def sltN(x,y):
  """Return whether `x` is strictly less than `y`."""
  below = x < y
  return below

def uleN(x,y):
  """Return whether `x` <= `y` when both are converted with unsigned()."""
  ux = unsigned(x)
  uy = unsigned(y)
  return ux <= uy

def ultN(x,y):
  """Return whether `x` < `y` when both are converted with unsigned()."""
  ux = unsigned(x)
  uy = unsigned(y)
  return ux < uy

# Logical right shifts: both operands are converted to the unsigned type
# of the given width, shifted, and the result converted back to the signed
# type.

def lshr8(x,y):
  shifted = np.uint8(x) >> np.uint8(y)
  return np.int8(shifted)

def lshr16(x,y):
  shifted = np.uint16(x) >> np.uint16(y)
  return np.int16(shifted)

def lshr32(x,y):
  shifted = np.uint32(x) >> np.uint32(y)
  return np.int32(shifted)

def lshr64(x,y):
  shifted = np.uint64(x) >> np.uint64(y)
  return np.int64(shifted)

# Conversions of an integer of any width to a signed integer of the given
# width, using the numpy scalar constructors.

def sext_T_i8(x):
  converted = np.int8(x)
  return converted

def sext_T_i16(x):
  converted = np.int16(x)
  return converted

def sext_T_i32(x):
  converted = np.int32(x)
  return converted

def sext_T_i64(x):
  converted = np.int64(x)
  return converted

def itob_T_bool(x):
  """Convert integer `x` to a boolean: False for zero, True otherwise.

  Uses the builtin bool.  `np.bool` used to be an alias of exactly that
  type and was removed in NumPy 1.24, where the old spelling raised
  AttributeError.
  """
  return bool(x)

# Boolean-to-integer conversions.  Each returns a numpy integer of the
# width named in the function (previously every variant returned np.int8).

def btoi_bool_i8(x):
  return np.int8(x)

def btoi_bool_i16(x):
  return np.int16(x)

def btoi_bool_i32(x):
  return np.int32(x)

def btoi_bool_i64(x):
  return np.int64(x)

# Zero extensions, named zext_<from>_<to>: the argument is first converted
# to the unsigned type of the source width and then to the signed type of
# the destination width.

def zext_i8_i8(x):
  as_unsigned = np.uint8(x)
  return np.int8(as_unsigned)

def zext_i8_i16(x):
  as_unsigned = np.uint8(x)
  return np.int16(as_unsigned)

def zext_i8_i32(x):
  as_unsigned = np.uint8(x)
  return np.int32(as_unsigned)

def zext_i8_i64(x):
  as_unsigned = np.uint8(x)
  return np.int64(as_unsigned)

def zext_i16_i8(x):
  as_unsigned = np.uint16(x)
  return np.int8(as_unsigned)

def zext_i16_i16(x):
  as_unsigned = np.uint16(x)
  return np.int16(as_unsigned)

def zext_i16_i32(x):
  as_unsigned = np.uint16(x)
  return np.int32(as_unsigned)

def zext_i16_i64(x):
  as_unsigned = np.uint16(x)
  return np.int64(as_unsigned)

def zext_i32_i8(x):
  as_unsigned = np.uint32(x)
  return np.int8(as_unsigned)

def zext_i32_i16(x):
  as_unsigned = np.uint32(x)
  return np.int16(as_unsigned)

def zext_i32_i32(x):
  as_unsigned = np.uint32(x)
  return np.int32(as_unsigned)

def zext_i32_i64(x):
  as_unsigned = np.uint32(x)
  return np.int64(as_unsigned)

def zext_i64_i8(x):
  as_unsigned = np.uint64(x)
  return np.int8(as_unsigned)

def zext_i64_i16(x):
  as_unsigned = np.uint64(x)
  return np.int16(as_unsigned)

def zext_i64_i32(x):
  as_unsigned = np.uint64(x)
  return np.int32(as_unsigned)

def zext_i64_i64(x):
  as_unsigned = np.uint64(x)
  return np.int64(as_unsigned)

# Width-specific names.  The generic implementations do not depend on the
# operand width, so every sized variant is an alias of the same function.
shl8 = shl16 = shl32 = shl64 = shlN
ashr8 = ashr16 = ashr32 = ashr64 = ashrN
sdiv8 = sdiv16 = sdiv32 = sdiv64 = sdivN
smod8 = smod16 = smod32 = smod64 = smodN
udiv8 = udiv16 = udiv32 = udiv64 = udivN
umod8 = umod16 = umod32 = umod64 = umodN
squot8 = squot16 = squot32 = squot64 = squotN
srem8 = srem16 = srem32 = srem64 = sremN
smax8 = smax16 = smax32 = smax64 = smaxN
smin8 = smin16 = smin32 = smin64 = sminN
umax8 = umax16 = umax32 = umax64 = umaxN
umin8 = umin16 = umin32 = umin64 = uminN
pow8 = pow16 = pow32 = pow64 = powN
fpow32 = fpow64 = fpowN
fmax32 = fmax64 = fmaxN
fmin32 = fmin64 = fminN
sle8 = sle16 = sle32 = sle64 = sleN
slt8 = slt16 = slt32 = slt64 = sltN
ule8 = ule16 = ule32 = ule64 = uleN
ult8 = ult16 = ult32 = ult64 = ultN
# Sign extensions and integer-to-bool conversions depend only on the
# destination type, so all source widths share one implementation.
sext_i8_i8 = sext_i16_i8 = sext_i32_i8 = sext_i64_i8 = sext_T_i8
sext_i8_i16 = sext_i16_i16 = sext_i32_i16 = sext_i64_i16 = sext_T_i16
sext_i8_i32 = sext_i16_i32 = sext_i32_i32 = sext_i64_i32 = sext_T_i32
sext_i8_i64 = sext_i16_i64 = sext_i32_i64 = sext_i64_i64 = sext_T_i64
itob_i8_bool = itob_i16_bool = itob_i32_bool = itob_i64_bool = itob_T_bool

def ssignum(x):
  """Return the sign of x exactly as np.sign computes it."""
  sign = np.sign(x)
  return sign

def usignum(x):
  """Return 0 when x is zero and 1 otherwise, in the dtype np.sign yields.

  A negative x is treated the same as a positive one, so the result is
  never -1.  The magnitude of np.sign(x) is taken instead of computing the
  sign of -x: negating the most negative value of a fixed-width integer
  overflows back to itself, which would make the result -1 for that input.
  """
  # np.sign gives -1/0/1 without overflow; np.abs folds -1 into 1.
  return np.abs(np.sign(x))

def sitofp_T_f32(x):
  """Convert x to np.float32; shared by the per-width aliases below."""
  as_f32 = np.float32(x)
  return as_f32
sitofp_i8_f32 = sitofp_T_f32
sitofp_i16_f32 = sitofp_T_f32
sitofp_i32_f32 = sitofp_T_f32
sitofp_i64_f32 = sitofp_T_f32

def sitofp_T_f64(x):
  """Convert x to np.float64; shared by the per-width aliases below."""
  as_f64 = np.float64(x)
  return as_f64
sitofp_i8_f64 = sitofp_T_f64
sitofp_i16_f64 = sitofp_T_f64
sitofp_i32_f64 = sitofp_T_f64
sitofp_i64_f64 = sitofp_T_f64

def uitofp_T_f32(x):
  """Pass x through the file's `unsigned` helper, then convert to np.float32."""
  reinterpreted = unsigned(x)
  return np.float32(reinterpreted)
uitofp_i8_f32 = uitofp_T_f32
uitofp_i16_f32 = uitofp_T_f32
uitofp_i32_f32 = uitofp_T_f32
uitofp_i64_f32 = uitofp_T_f32

def uitofp_T_f64(x):
  """Pass x through the file's `unsigned` helper, then convert to np.float64."""
  reinterpreted = unsigned(x)
  return np.float64(reinterpreted)
uitofp_i8_f64 = uitofp_T_f64
uitofp_i16_f64 = uitofp_T_f64
uitofp_i32_f64 = uitofp_T_f64
uitofp_i64_f64 = uitofp_T_f64

def fptosi_T_i8(x):
  """Truncate x toward zero with np.trunc and convert the result to np.int8."""
  whole_part = np.trunc(x)
  return np.int8(whole_part)
fptosi_f32_i8 = fptosi_T_i8
fptosi_f64_i8 = fptosi_T_i8

def fptosi_T_i16(x):
  """Truncate x toward zero with np.trunc and convert the result to np.int16."""
  whole_part = np.trunc(x)
  return np.int16(whole_part)
fptosi_f32_i16 = fptosi_T_i16
fptosi_f64_i16 = fptosi_T_i16

def fptosi_T_i32(x):
  """Truncate x toward zero with np.trunc and convert the result to np.int32."""
  whole_part = np.trunc(x)
  return np.int32(whole_part)
fptosi_f32_i32 = fptosi_T_i32
fptosi_f64_i32 = fptosi_T_i32

def fptosi_T_i64(x):
  """Truncate x toward zero with np.trunc and convert the result to np.int64."""
  whole_part = np.trunc(x)
  return np.int64(whole_part)
fptosi_f32_i64 = fptosi_T_i64
fptosi_f64_i64 = fptosi_T_i64

def fptoui_T_i8(x):
  """Truncate x toward zero with np.trunc and convert the result to np.uint8."""
  whole_part = np.trunc(x)
  return np.uint8(whole_part)
fptoui_f32_i8 = fptoui_T_i8
fptoui_f64_i8 = fptoui_T_i8

def fptoui_T_i16(x):
  """Truncate x toward zero with np.trunc and convert the result to np.uint16."""
  whole_part = np.trunc(x)
  return np.uint16(whole_part)
fptoui_f32_i16 = fptoui_T_i16
fptoui_f64_i16 = fptoui_T_i16

def fptoui_T_i32(x):
  """Truncate x toward zero with np.trunc and convert the result to np.uint32."""
  whole_part = np.trunc(x)
  return np.uint32(whole_part)
fptoui_f32_i32 = fptoui_T_i32
fptoui_f64_i32 = fptoui_T_i32

def fptoui_T_i64(x):
  """Truncate x toward zero with np.trunc and convert the result to np.uint64."""
  whole_part = np.trunc(x)
  return np.uint64(whole_part)
fptoui_f32_i64 = fptoui_T_i64
fptoui_f64_i64 = fptoui_T_i64

def fpconv_f32_f64(x):
  """Convert x to np.float64."""
  widened = np.float64(x)
  return widened

def fpconv_f64_f32(x):
  """Convert x to np.float32."""
  narrowed = np.float32(x)
  return narrowed

def futhark_log64(x):
  """Return the natural logarithm of x (np.log) as np.float64."""
  natural_log = np.log(x)
  return np.float64(natural_log)

def futhark_log2_64(x):
  """Return the base-2 logarithm of x (np.log2) as np.float64."""
  binary_log = np.log2(x)
  return np.float64(binary_log)

def futhark_log10_64(x):
  """Return the base-10 logarithm of x (np.log10) as np.float64."""
  decimal_log = np.log10(x)
  return np.float64(decimal_log)

def futhark_sqrt64(x):
  """Return np.sqrt(x); no explicit cast is applied to the result."""
  root = np.sqrt(x)
  return root

def futhark_exp64(x):
  """Return np.exp(x); no explicit cast is applied to the result."""
  exponential = np.exp(x)
  return exponential

def futhark_cos64(x):
  """Return np.cos(x); no explicit cast is applied to the result."""
  cosine = np.cos(x)
  return cosine

def futhark_sin64(x):
  """Return np.sin(x); no explicit cast is applied to the result."""
  sine = np.sin(x)
  return sine

def futhark_tan64(x):
  """Return np.tan(x); no explicit cast is applied to the result."""
  tangent = np.tan(x)
  return tangent

def futhark_acos64(x):
  """Return np.arccos(x); no explicit cast is applied to the result."""
  angle = np.arccos(x)
  return angle

def futhark_asin64(x):
  """Return np.arcsin(x); no explicit cast is applied to the result."""
  angle = np.arcsin(x)
  return angle

def futhark_atan64(x):
  """Return np.arctan(x); no explicit cast is applied to the result."""
  angle = np.arctan(x)
  return angle

def futhark_atan2_64(x, y):
  """Return np.arctan2(x, y), passing x as the first argument and y second."""
  angle = np.arctan2(x, y)
  return angle

def futhark_round64(x):
  """Return np.round(x), i.e. x rounded to zero decimals by NumPy's rule."""
  rounded = np.round(x)
  return rounded

def futhark_isnan64(x):
  """Return np.isnan(x)."""
  is_nan = np.isnan(x)
  return is_nan

def futhark_isinf64(x):
  """Return np.isinf(x)."""
  is_inf = np.isinf(x)
  return is_inf

def futhark_to_bits64(x):
  """Return the bit pattern of the double x as an np.int64.

  x is packed as a big-endian 8-byte float and the same bytes are unpacked
  as a big-endian signed 8-byte integer.
  """
  (bits,) = struct.unpack('>q', struct.pack('>d', x))
  return np.int64(bits)

def futhark_from_bits64(x):
  """Return the double whose bit pattern is the integer x, as np.float64.

  x is packed as a big-endian signed 8-byte integer and the same bytes are
  unpacked as a big-endian 8-byte float.
  """
  (value,) = struct.unpack('>d', struct.pack('>q', x))
  return np.float64(value)

def futhark_log32(x):
  """Return the natural logarithm of x (np.log) as np.float32."""
  natural_log = np.log(x)
  return np.float32(natural_log)

def futhark_log2_32(x):
  """Return the base-2 logarithm of x (np.log2) as np.float32."""
  binary_log = np.log2(x)
  return np.float32(binary_log)

def futhark_log10_32(x):
  """Return the base-10 logarithm of x (np.log10) as np.float32."""
  decimal_log = np.log10(x)
  return np.float32(decimal_log)

def futhark_sqrt32(x):
  """Return the square root of x (np.sqrt) as np.float32."""
  root = np.sqrt(x)
  return np.float32(root)

def futhark_exp32(x):
  """Return np.exp(x); no explicit float32 cast is applied to the result."""
  exponential = np.exp(x)
  return exponential

def futhark_cos32(x):
  """Return np.cos(x); no explicit float32 cast is applied to the result."""
  cosine = np.cos(x)
  return cosine

def futhark_sin32(x):
  """Return np.sin(x); no explicit float32 cast is applied to the result."""
  sine = np.sin(x)
  return sine

def futhark_tan32(x):
  """Return np.tan(x); no explicit float32 cast is applied to the result."""
  tangent = np.tan(x)
  return tangent

def futhark_acos32(x):
  """Return np.arccos(x); no explicit float32 cast is applied to the result."""
  angle = np.arccos(x)
  return angle

def futhark_asin32(x):
  """Return np.arcsin(x); no explicit float32 cast is applied to the result."""
  angle = np.arcsin(x)
  return angle

def futhark_atan32(x):
  """Return np.arctan(x); no explicit float32 cast is applied to the result."""
  angle = np.arctan(x)
  return angle

def futhark_atan2_32(x, y):
  """Return np.arctan2(x, y), passing x as the first argument and y second."""
  angle = np.arctan2(x, y)
  return angle

def futhark_round32(x):
  """Return np.round(x), i.e. x rounded to zero decimals by NumPy's rule."""
  rounded = np.round(x)
  return rounded

def futhark_isnan32(x):
  """Return np.isnan(x)."""
  is_nan = np.isnan(x)
  return is_nan

def futhark_isinf32(x):
  """Return np.isinf(x)."""
  is_inf = np.isinf(x)
  return is_inf

def futhark_to_bits32(x):
  """Return the bit pattern of the single-precision float x as an np.int32.

  x is packed as a big-endian 4-byte float and the same bytes are unpacked
  as a big-endian signed 4-byte integer.
  """
  (bits,) = struct.unpack('>l', struct.pack('>f', x))
  return np.int32(bits)

def futhark_from_bits32(x):
  """Return the float whose 32-bit pattern is the integer x, as np.float32.

  x is packed as a big-endian signed 4-byte integer and the same bytes are
  unpacked as a big-endian 4-byte float.
  """
  (value,) = struct.unpack('>f', struct.pack('>l', x))
  return np.float32(value)
class bfastfinaldetailed:
  # Table of entry points: each name maps to a pair of lists of type
  # strings ("i32", "f32", "[]i32", "[][]f32", ...).
  # NOTE(review): presumably the pair is (parameter types, result types) --
  # confirm against the code that consumes entry_points.
  entry_points = {"main": (["i32", "i32", "i32", "f32", "f32", "f32", "[]i32",
                            "[][]f32"], ["[]f32", "[]i32", "[]i32", "[]f32",
                                         "[][]f32", "[][]f32", "[]f32", "[]i32",
                                         "[]f32", "[][]f32", "[][]f32"]),
                  "remove_nans": (["i16", "[][][]i16"], ["[][][]f32"]),
                  "reshapeTransp": (["[][][]f32"], ["[][]f32"])}
  def __init__(self, command_queue=None, interactive=False,
               platform_pref=preferred_platform, device_pref=preferred_device,
               default_group_size=default_group_size,
               default_num_groups=default_num_groups,
               default_tile_size=default_tile_size,
               default_threshold=default_threshold, sizes=sizes):
    """Build the OpenCL program and stage its kernels and counter buffers.

    Every constructor argument is forwarded to initialise_opencl_object
    (sizes is passed as user_sizes).  Afterwards each kernel of the returned
    program is stored on the instance under the kernel's name plus "_var",
    and for every counter_mem_* attribute a zero-filled int32 host array is
    uploaded into a freshly allocated device buffer.
    """
    gpu = cl.device_type.GPU
    cpu = cl.device_type.CPU
    # NOTE(review): rows look like (platform name, device type, size name,
    # value) -- the interpretation is up to initialise_opencl_object.
    size_heuristics = [
      ("NVIDIA CUDA", gpu, "lockstep_width", 32),
      ("AMD Accelerated Parallel Processing", gpu, "lockstep_width", 64),
      ("", gpu, "lockstep_width", 1),
      ("", gpu, "num_groups", 256),
      ("", gpu, "group_size", 256),
      ("", gpu, "tile_size", 32),
      ("", gpu, "threshold", 32768),
      ("", cpu, "lockstep_width", 1),
      ("", cpu, "num_groups", "MAX_COMPUTE_UNITS"),
      ("", cpu, "group_size", 32),
      ("", cpu, "tile_size", 4),
      ("", cpu, "threshold", "MAX_COMPUTE_UNITS"),
    ]

    # Tunable sizes, inserted in the same order as the generated table:
    # group sizes, group counts, thresholds, tile sizes, then remove_nans.
    all_sizes = {}
    for ident in (32085, 32287, 32441, 32508, 32612, 32804, 33486, 33536,
                  33597, 33694, 33855, 34020, 34191, 34286, 34352, 34516,
                  34671, 34852, 35027, 35127, 35281, 35426, 35449, 35474,
                  35577, 35618, 35705, 35801, 35886, 36289, 36338, 36371,
                  36405, 36446, 36479, 38709, 38775, 38964, 38971, 38976,
                  38981, 39219, 39299):
      all_sizes["main.group_size_%d" % ident] = {"class": "group_size",
                                                 "value": None}
    for ident in (32806, 34022, 34354, 34673, 35129, 35451, 35476, 35579,
                  35707, 36407, 36448):
      all_sizes["main.max_num_groups_%d" % ident] = {"class": "num_groups",
                                                     "value": None}
    # (size name, text placed between the parentheses of its threshold class)
    thresholds = (
      ("main.suff_intra_par_11",
       "!main.suff_outer_par_10 !main.suff_outer_par_8 !main.suff_intra_par_9 !main.suff_outer_par_6 !main.suff_intra_par_7"),
      ("main.suff_intra_par_13", "!main.suff_outer_par_12"),
      ("main.suff_intra_par_18", "!main.suff_outer_par_17"),
      ("main.suff_intra_par_20",
       "!main.suff_outer_par_19 !main.suff_outer_par_17 !main.suff_intra_par_18"),
      ("main.suff_intra_par_22", "!main.suff_outer_par_21"),
      ("main.suff_intra_par_24",
       "!main.suff_outer_par_23 !main.suff_outer_par_21 !main.suff_intra_par_22"),
      ("main.suff_intra_par_26", "!main.suff_outer_par_25"),
      ("main.suff_intra_par_28",
       "!main.suff_outer_par_27 !main.suff_outer_par_25 !main.suff_intra_par_26"),
      ("main.suff_intra_par_30", "!main.suff_outer_par_29"),
      ("main.suff_intra_par_34", "!main.suff_outer_par_33"),
      ("main.suff_intra_par_36", "!main.suff_outer_par_35"),
      ("main.suff_intra_par_39", "!main.suff_outer_par_38"),
      ("main.suff_intra_par_7", "!main.suff_outer_par_6"),
      ("main.suff_intra_par_9",
       "!main.suff_outer_par_8 !main.suff_outer_par_6 !main.suff_intra_par_7"),
      ("main.suff_outer_par_10",
       "!main.suff_outer_par_8 !main.suff_intra_par_9 !main.suff_outer_par_6 !main.suff_intra_par_7"),
      ("main.suff_outer_par_17", ""),
      ("main.suff_outer_par_19",
       "!main.suff_outer_par_17 !main.suff_intra_par_18"),
      ("main.suff_outer_par_21", ""),
      ("main.suff_outer_par_23",
       "!main.suff_outer_par_21 !main.suff_intra_par_22"),
      ("main.suff_outer_par_25", ""),
      ("main.suff_outer_par_27",
       "!main.suff_outer_par_25 !main.suff_intra_par_26"),
      ("main.suff_outer_par_29", ""),
      ("main.suff_outer_par_33", ""),
      ("main.suff_outer_par_35", ""),
      ("main.suff_outer_par_38", ""),
      ("main.suff_outer_par_6", ""),
      ("main.suff_outer_par_8",
       "!main.suff_outer_par_6 !main.suff_intra_par_7"),
    )
    for size_name, guards in thresholds:
      all_sizes[size_name] = {"class": "threshold (%s)" % guards,
                              "value": None}
    for ident in (37202, 37709, 37759):
      all_sizes["main.tile_size_%d" % ident] = {"class": "tile_size",
                                                "value": None}
    all_sizes["remove_nans.group_size_31908"] = {"class": "group_size",
                                                 "value": None}

    program = initialise_opencl_object(
      self,
      program_src=fut_opencl_src,
      command_queue=command_queue,
      interactive=interactive,
      platform_pref=platform_pref,
      device_pref=device_pref,
      default_group_size=default_group_size,
      default_num_groups=default_num_groups,
      default_tile_size=default_tile_size,
      default_threshold=default_threshold,
      size_heuristics=size_heuristics,
      required_types=["i16", "i32", "f32", "bool"],
      user_sizes=sizes,
      all_sizes=all_sizes)

    # Fetch each kernel from the program once and keep it as self.NAME_var.
    kernel_names = (
      "copy_38706", "copy_38772", "copy_38968",
      "map_31914", "map_32091", "map_32293", "map_32447", "map_32514",
      "map_32618", "map_32725", "map_33492", "map_33542", "map_33603",
      "map_33700", "map_33861", "map_33955", "map_34197", "map_34292",
      "map_34522", "map_34610", "map_34858", "map_35033", "map_35287",
      "map_35432", "map_35624", "map_35807", "map_35892", "map_36295",
      "map_36344", "map_36377", "map_36485",
      "map_intra_group_32493", "map_intra_group_32641",
      "map_intra_group_32738", "map_intra_group_33149",
      "map_intra_group_33844", "map_intra_group_33966",
      "map_intra_group_34180", "map_intra_group_34303",
      "map_intra_group_34505", "map_intra_group_34621",
      "map_intra_group_34818", "map_intra_group_35256",
      "map_intra_group_35604", "map_intra_group_35851",
      "map_transpose_f32", "map_transpose_f32_low_height",
      "map_transpose_f32_low_width", "map_transpose_f32_small",
      "map_transpose_i32", "map_transpose_i32_low_height",
      "map_transpose_i32_low_width", "map_transpose_i32_small",
      "replicate_38973", "replicate_38978", "replicate_39296",
      "scan_stage1_35145", "scan_stage1_36464",
      "scan_stage2_38948", "scan_stage2_39203",
      "scan_stage3_38961", "scan_stage3_39216",
      "segred_large_32822", "segred_large_34038", "segred_large_34370",
      "segred_large_34689", "segred_large_35467", "segred_large_35492",
      "segred_large_35723", "segred_large_36423",
      "segred_nonseg_35595",
      "segred_small_32822", "segred_small_34038", "segred_small_34370",
      "segred_small_34689", "segred_small_35467", "segred_small_35492",
      "segred_small_35723", "segred_small_36423",
    )
    for kernel_name in kernel_names:
      setattr(self, kernel_name + "_var", getattr(program, kernel_name))

    # (instance attribute, allocation label, number of int32 elements)
    counters = (
      ("counter_mem_38670", "static_mem_39303", 1024),
      ("counter_mem_38755", "static_mem_39306", 1024),
      ("counter_mem_38816", "static_mem_39307", 1024),
      ("counter_mem_38877", "static_mem_39308", 1024),
      ("counter_mem_39020", "static_mem_39309", 1024),
      ("counter_mem_39055", "static_mem_39310", 1024),
      ("counter_mem_39076", "static_mem_39311", 1),
      ("counter_mem_39124", "static_mem_39313", 1024),
      ("counter_mem_39259", "static_mem_39315", 1024),
    )
    for attr_name, label, length in counters:
      host_zeros = np.zeros(length, dtype=np.int32)
      nbytes = 4 * length  # each int32 element occupies 4 bytes
      device_buf = opencl_alloc(self, nbytes, label)
      if nbytes != 0:
        cl.enqueue_copy(self.queue, device_buf,
                        normaliseArray(host_zeros),
                        is_blocking=synchronous)
      setattr(self, attr_name, device_buf)
  def futhark_main(self, mappingindices_mem_37893, images_mem_37894,
                   sizze_31214, sizze_31215, sizze_31216, trend_31217, k_31218,
                   n_31219, freq_31220, hfrac_31221, lam_31222):
    dim_zzero_31225 = (np.int32(0) == sizze_31215)
    dim_zzero_31226 = (np.int32(0) == sizze_31216)
    old_empty_31227 = (dim_zzero_31225 or dim_zzero_31226)
    dim_zzero_31228 = (np.int32(0) == sizze_31214)
    new_empty_31229 = (dim_zzero_31225 or dim_zzero_31228)
    both_empty_31230 = (old_empty_31227 and new_empty_31229)
    dim_match_31231 = (sizze_31214 == sizze_31216)
    empty_or_match_31232 = (both_empty_31230 or dim_match_31231)
    empty_or_match_cert_31233 = True
    assert empty_or_match_31232, ("Error at bfastfinaldetailed.fut:112:1-240:86: %s" % ("function arguments of wrong shape",))
    x_31234 = (np.int32(2) * k_31218)
    res_31235 = (np.int32(2) + x_31234)
    cond_31236 = slt32(np.int32(0), trend_31217)
    if cond_31236:
      res_31237 = res_31235
    else:
      res_31238 = (res_31235 - np.int32(1))
      res_31237 = res_31238
    bounds_invalid_upwards_31239 = slt32(res_31237, np.int32(0))
    convop_x_37896 = (sizze_31214 * res_31237)
    binop_x_37897 = sext_i32_i64(convop_x_37896)
    bytes_37895 = (np.int64(4) * binop_x_37897)
    if cond_31236:
      eq_x_zz_31241 = (np.int32(0) == res_31237)
      not_p_31242 = not(bounds_invalid_upwards_31239)
      p_and_eq_x_y_31243 = (eq_x_zz_31241 and not_p_31242)
      dim_zzero_31244 = (bounds_invalid_upwards_31239 or p_and_eq_x_y_31243)
      both_empty_31245 = (eq_x_zz_31241 and dim_zzero_31244)
      empty_or_match_31249 = (not_p_31242 or both_empty_31245)
      empty_or_match_cert_31250 = True
      assert empty_or_match_31249, ("Error at bfastfinaldetailed.fut:112:1-240:86 -> bfastfinaldetailed.fut:123:16-55 -> bfastfinaldetailed.fut:45:10-18 -> /futlib/array.fut:61:1-62:12: %s%s%s%d%s%s" % ("Function return value does not match shape of type ",
                                                                                                                                                                                                           "*",
                                                                                                                                                                                                           "[",
                                                                                                                                                                                                           res_31237,
                                                                                                                                                                                                           "]",
                                                                                                                                                                                                           "intrinsics.i32"))
      group_sizze_32172 = self.sizes["main.group_size_32085"]
      y_32173 = (group_sizze_32172 - np.int32(1))
      x_32174 = (y_32173 + convop_x_37896)
      num_groups_32175 = squot32(x_32174, group_sizze_32172)
      num_threads_32176 = (group_sizze_32172 * num_groups_32175)
      mem_37898 = opencl_alloc(self, bytes_37895, "mem_37898")
      if ((1 * (np.long(num_groups_32175) * np.long(group_sizze_32172))) != 0):
        self.map_32091_var.set_args(np.int32(sizze_31214),
                                    np.float32(freq_31220), np.int32(res_31237),
                                    mappingindices_mem_37893, mem_37898)
        cl.enqueue_nd_range_kernel(self.queue, self.map_32091_var,
                                   ((np.long(num_groups_32175) * np.long(group_sizze_32172)),),
                                   (np.long(group_sizze_32172),))
        if synchronous:
          self.queue.finish()
      arg_mem_37903 = mem_37898
    else:
      eq_x_zz_31272 = (np.int32(0) == res_31237)
      not_p_31273 = not(bounds_invalid_upwards_31239)
      p_and_eq_x_y_31274 = (eq_x_zz_31272 and not_p_31273)
      dim_zzero_31275 = (bounds_invalid_upwards_31239 or p_and_eq_x_y_31274)
      both_empty_31276 = (eq_x_zz_31272 and dim_zzero_31275)
      empty_or_match_31280 = (not_p_31273 or both_empty_31276)
      empty_or_match_cert_31281 = True
      assert empty_or_match_31280, ("Error at bfastfinaldetailed.fut:112:1-240:86 -> bfastfinaldetailed.fut:124:16-55 -> bfastfinaldetailed.fut:57:10-20 -> /futlib/array.fut:61:1-62:12: %s%s%s%d%s%s" % ("Function return value does not match shape of type ",
                                                                                                                                                                                                           "*",
                                                                                                                                                                                                           "[",
                                                                                                                                                                                                           res_31237,
                                                                                                                                                                                                           "]",
                                                                                                                                                                                                           "intrinsics.i32"))
      group_sizze_32367 = self.sizes["main.group_size_32287"]
      y_32368 = (group_sizze_32367 - np.int32(1))
      x_32369 = (y_32368 + convop_x_37896)
      num_groups_32370 = squot32(x_32369, group_sizze_32367)
      num_threads_32371 = (group_sizze_32367 * num_groups_32370)
      mem_37902 = opencl_alloc(self, bytes_37895, "mem_37902")
      if ((1 * (np.long(num_groups_32370) * np.long(group_sizze_32367))) != 0):
        self.map_32293_var.set_args(np.int32(sizze_31214),
                                    np.float32(freq_31220), np.int32(res_31237),
                                    mappingindices_mem_37893, mem_37902)
        cl.enqueue_nd_range_kernel(self.queue, self.map_32293_var,
                                   ((np.long(num_groups_32370) * np.long(group_sizze_32367)),),
                                   (np.long(group_sizze_32367),))
        if synchronous:
          self.queue.finish()
      arg_mem_37903 = mem_37902
    # Scalar argument for the map_32447 kernel.  The steps below build
    # sizze_31214**2 + 2*sizze_31214 + 1, divide it by (sizze_31214 + 1) with
    # sdiv32, subtract sizze_31214 and then 1, and convert the result with
    # sitofp_i32_f32 (both helpers are defined outside this excerpt).
    x_31302 = (sizze_31214 * sizze_31214)
    y_31303 = (np.int32(2) * sizze_31214)
    x_31304 = (x_31302 + y_31303)
    x_31305 = (np.int32(1) + x_31304)
    y_31306 = (np.int32(1) + sizze_31214)
    x_31307 = sdiv32(x_31305, y_31306)
    x_31308 = (x_31307 - sizze_31214)
    arg_31309 = (x_31308 - np.int32(1))
    res_31310 = sitofp_i32_f32(arg_31309)
    # Launch geometry: (group size - 1) is added to convop_x_37896 before the
    # squot32 division, so the group count is rounded up
    # (assumes squot32 is truncating integer division -- TODO confirm).
    group_sizze_32469 = self.sizes["main.group_size_32441"]
    y_32470 = (group_sizze_32469 - np.int32(1))
    x_32471 = (y_32470 + convop_x_37896)
    num_groups_32472 = squot32(x_32471, group_sizze_32469)
    num_threads_32473 = (group_sizze_32469 * num_groups_32472)
    # Fill a fresh buffer from arg_mem_37903 through the f32 map-transpose
    # helper.  NOTE(review): the meaning of the offset/dimension arguments is
    # defined by futhark__map_transpose_f32, which is not visible here.
    mem_37907 = opencl_alloc(self, bytes_37895, "mem_37907")
    self.futhark__map_transpose_f32(mem_37907, np.int32(0), arg_mem_37903,
                                    np.int32(0), np.int32(1), sizze_31214,
                                    res_31237, (res_31237 * sizze_31214),
                                    (res_31237 * sizze_31214))
    # Second buffer handed to map_32447 (presumably its output -- confirm
    # against the kernel source).
    mem_37911 = opencl_alloc(self, bytes_37895, "mem_37911")
    # Skip the enqueue entirely when the global work size would be zero.
    if ((1 * (np.long(num_groups_32472) * np.long(group_sizze_32469))) != 0):
      self.map_32447_var.set_args(np.int32(sizze_31214), np.int32(res_31237),
                                  np.float32(res_31310), mem_37907, mem_37911)
      cl.enqueue_nd_range_kernel(self.queue, self.map_32447_var,
                                 ((np.long(num_groups_32472) * np.long(group_sizze_32469)),),
                                 (np.long(group_sizze_32469),))
      # When `synchronous` is set, wait for the queue to drain before going on.
      if synchronous:
        self.queue.finish()
    # res_31237 - 1; not used by the checks in this group.
    m_31319 = (res_31237 - np.int32(1))
    # Validity of a slice of n_31219 elements along a dimension of size
    # sizze_31214: accepted when n_31219 == 0, or when 0 <= n_31219 - 1,
    # n_31219 - 1 < sizze_31214 and 0 <= n_31219 all hold (assuming sle32 /
    # slt32 are signed <= / < -- they are defined outside this excerpt).
    empty_slice_31326 = (n_31219 == np.int32(0))
    m_31327 = (n_31219 - np.int32(1))
    zzero_leq_i_p_m_t_s_31328 = sle32(np.int32(0), m_31327)
    i_p_m_t_s_leq_w_31329 = slt32(m_31327, sizze_31214)
    i_lte_j_31330 = sle32(np.int32(0), n_31219)
    y_31331 = (zzero_leq_i_p_m_t_s_31328 and i_p_m_t_s_leq_w_31329)
    y_31332 = (i_lte_j_31330 and y_31331)
    ok_or_empty_31333 = (empty_slice_31326 or y_31332)
    # NOTE: the certificate variable is set to True unconditionally; the
    # assert is what enforces the check, and Python skips asserts under -O.
    index_certs_31335 = True
    assert ok_or_empty_31333, ("Error at bfastfinaldetailed.fut:112:1-240:86 -> bfastfinaldetailed.fut:133:15-21: %s%d%s%s%s%d%s%d%s%d%s" % ("Index [",
                                                                                                                                             np.int32(0),
                                                                                                                                             ", ",
                                                                                                                                             "",
                                                                                                                                             ":",
                                                                                                                                             n_31219,
                                                                                                                                             "] out of bounds for array of shape [",
                                                                                                                                             res_31237,
                                                                                                                                             "][",
                                                                                                                                             sizze_31214,
                                                                                                                                             "]."))
    # Same condition as above, reported for a different source location and
    # with the index/shape order swapped in the message.
    index_certs_31337 = True
    assert ok_or_empty_31333, ("Error at bfastfinaldetailed.fut:112:1-240:86 -> bfastfinaldetailed.fut:134:15-22: %s%s%s%d%s%d%s%d%s%d%s" % ("Index [",
                                                                                                                                             "",
                                                                                                                                             ":",
                                                                                                                                             n_31219,
                                                                                                                                             ", ",
                                                                                                                                             np.int32(0),
                                                                                                                                             "] out of bounds for array of shape [",
                                                                                                                                             sizze_31214,
                                                                                                                                             "][",
                                                                                                                                             res_31237,
                                                                                                                                             "]."))
    # Additional condition on sizze_31215: it is zero, or both
    # 0 <= sizze_31215 - 1 and 0 <= sizze_31215 hold.  The final check is the
    # conjunction of this with the n_31219 condition computed above.
    empty_slice_31339 = (sizze_31215 == np.int32(0))
    m_31340 = (sizze_31215 - np.int32(1))
    zzero_leq_i_p_m_t_s_31341 = sle32(np.int32(0), m_31340)
    i_lte_j_31343 = sle32(np.int32(0), sizze_31215)
    y_31345 = (zzero_leq_i_p_m_t_s_31341 and i_lte_j_31343)
    ok_or_empty_31346 = (empty_slice_31339 or y_31345)
    index_ok_31347 = (ok_or_empty_31333 and ok_or_empty_31346)
    index_certs_31348 = True
    assert index_ok_31347, ("Error at bfastfinaldetailed.fut:112:1-240:86 -> bfastfinaldetailed.fut:135:15-26: %s%d%s%s%s%d%s%d%s%d%s" % ("Index [",
                                                                                                                                          np.int32(0),
                                                                                                                                          ", ",
                                                                                                                                          "",
                                                                                                                                          ":",
                                                                                                                                          n_31219,
                                                                                                                                          "] out of bounds for array of shape [",
                                                                                                                                          sizze_31215,
                                                                                                                                          "][",
                                                                                                                                          sizze_31214,
                                                                                                                                          "]."))
    # Predicates and buffer sizes consumed by the nested if/else that follows.
    # Every "suff_*" flag compares a tunable threshold read from self.sizes
    # against an amount of available work; every "fits_*" flag compares a
    # candidate work-group size against self.max_group_size.
    suff_outer_par_32478 = (self.sizes["main.suff_outer_par_6"] <= sizze_31215)
    one_intra_par_min_32488 = (res_31237 * res_31237)
    intra_avail_par_32490 = smin32(res_31237, one_intra_par_min_32488)
    computed_group_sizze_32491 = smax32(res_31237, one_intra_par_min_32488)
    num_threads_32492 = (sizze_31215 * computed_group_sizze_32491)
    max_group_sizze_32576 = self.max_group_size
    fits_32577 = sle32(computed_group_sizze_32491, max_group_sizze_32576)
    suff_intra_par_32575 = (self.sizes["main.suff_intra_par_7"] <= intra_avail_par_32490)
    intra_suff_and_fits_32578 = (suff_intra_par_32575 and fits_32577)
    # Byte counts: 4 * element count, with the 32-bit count widened by
    # sext_i32_i64 first (4 bytes per element -- presumably f32; confirm).
    # bytes_37912 covers sizze_31215 * sizze_31216 elements.
    convop_x_37913 = (sizze_31215 * sizze_31216)
    binop_x_37914 = sext_i32_i64(convop_x_37913)
    bytes_37912 = (np.int64(4) * binop_x_37914)
    # bytes_37926 covers sizze_31215 * res_31237 * res_31237 elements.
    convop_x_37928 = (sizze_31215 * one_intra_par_min_32488)
    binop_x_37929 = sext_i32_i64(convop_x_37928)
    bytes_37926 = (np.int64(4) * binop_x_37929)
    # bytes_37931 covers res_31237 * (sizze_31215 * res_31237) elements.
    binop_x_37932 = (sizze_31215 * res_31237)
    convop_x_37933 = (res_31237 * binop_x_37932)
    binop_x_37934 = sext_i32_i64(convop_x_37933)
    bytes_37931 = (np.int64(4) * binop_x_37934)
    group_sizze_32971 = self.sizes["main.group_size_32612"]
    y_32972 = (group_sizze_32971 - np.int32(1))
    x_32973 = (y_32972 + binop_x_37932)
    suff_outer_par_32976 = (self.sizes["main.suff_outer_par_8"] <= binop_x_37932)
    fits_32980 = sle32(res_31237, max_group_sizze_32576)
    suff_intra_par_32981 = (self.sizes["main.suff_intra_par_9"] <= res_31237)
    intra_suff_and_fits_32982 = (fits_32980 and suff_intra_par_32981)
    suff_outer_par_33044 = (self.sizes["main.suff_outer_par_10"] <= convop_x_37928)
    num_threads_33047 = (n_31219 * convop_x_37928)
    fits_33049 = sle32(n_31219, max_group_sizze_32576)
    suff_intra_par_33050 = (self.sizes["main.suff_intra_par_11"] <= n_31219)
    intra_suff_and_fits_33051 = (fits_33049 and suff_intra_par_33050)
    # Choose one of several code versions that all end by binding
    # res_mem_38006.  The tests are tried in order; each flag was computed
    # just above from a threshold in self.sizes (and, for the intra-group
    # versions, a comparison against the maximum group size).
    if suff_outer_par_32478:
      # Version: map_32514.  Group count is
      # squot32(sizze_31215 + group_size - 1, group_size).
      group_sizze_32543 = self.sizes["main.group_size_32508"]
      y_32544 = (group_sizze_32543 - np.int32(1))
      x_32545 = (sizze_31215 + y_32544)
      num_groups_32546 = squot32(x_32545, group_sizze_32543)
      num_threads_32547 = (group_sizze_32543 * num_groups_32546)
      # Run the map-transpose helper from images_mem_37894 into mem_37915.
      mem_37915 = opencl_alloc(self, bytes_37912, "mem_37915")
      self.futhark__map_transpose_f32(mem_37915, np.int32(0), images_mem_37894,
                                      np.int32(0), np.int32(1), sizze_31216,
                                      sizze_31215, (sizze_31215 * sizze_31216),
                                      (sizze_31215 * sizze_31216))
      mem_37930 = opencl_alloc(self, bytes_37926, "mem_37930")
      # mem_37919 is sized as bytes_37916 (4 * res_31237**2) times the number
      # of launched threads.
      binop_x_37918 = sext_i32_i64(one_intra_par_min_32488)
      bytes_37916 = (np.int64(4) * binop_x_37918)
      num_threads64_38481 = sext_i32_i64(num_threads_32547)
      total_sizze_38482 = (bytes_37916 * num_threads64_38481)
      mem_37919 = opencl_alloc(self, total_sizze_38482, "mem_37919")
      # Enqueue only when the global work size is nonzero.
      if ((1 * (np.long(num_groups_32546) * np.long(group_sizze_32543))) != 0):
        self.map_32514_var.set_args(np.int32(sizze_31214),
                                    np.int32(sizze_31215), np.int32(n_31219),
                                    np.int32(res_31237), arg_mem_37903,
                                    mem_37911, mem_37915, mem_37919, mem_37930)
        cl.enqueue_nd_range_kernel(self.queue, self.map_32514_var,
                                   ((np.long(num_groups_32546) * np.long(group_sizze_32543)),),
                                   (np.long(group_sizze_32543),))
        if synchronous:
          self.queue.finish()
      # Drop the local references to buffers this branch no longer uses.
      mem_37915 = None
      mem_37919 = None
      # Pass mem_37930 through the map-transpose helper into the result buffer.
      mem_37935 = opencl_alloc(self, bytes_37931, "mem_37935")
      self.futhark__map_transpose_f32(mem_37935, np.int32(0), mem_37930,
                                      np.int32(0), np.int32(1), sizze_31215,
                                      (res_31237 * res_31237),
                                      ((sizze_31215 * res_31237) * res_31237),
                                      ((sizze_31215 * res_31237) * res_31237))
      mem_37930 = None
      res_mem_38006 = mem_37935
    else:
      if intra_suff_and_fits_32578:
        # Version: map_intra_group_32493.  Global size is
        # sizze_31215 * computed_group_sizze_32491 with local size
        # computed_group_sizze_32491, i.e. sizze_31215 work-groups.
        mem_37948 = opencl_alloc(self, bytes_37931, "mem_37948")
        binop_x_37938 = sext_i32_i64(res_31237)
        bytes_37937 = (np.int64(4) * binop_x_37938)
        # bytes_37940 (4 * res_31237**2) is the size of the kernel's
        # cl.LocalMemory argument.
        binop_x_37942 = sext_i32_i64(one_intra_par_min_32488)
        bytes_37940 = (np.int64(4) * binop_x_37942)
        num_threads64_38483 = sext_i32_i64(num_threads_32492)
        total_sizze_38484 = (bytes_37937 * num_threads64_38483)
        mem_37939 = opencl_alloc(self, total_sizze_38484, "mem_37939")
        if ((1 * (np.long(sizze_31215) * np.long(computed_group_sizze_32491))) != 0):
          self.map_intra_group_32493_var.set_args(cl.LocalMemory(np.long(bytes_37940)),
                                                  np.int32(sizze_31214),
                                                  np.int32(sizze_31215),
                                                  np.int32(sizze_31216),
                                                  np.int32(n_31219),
                                                  np.int32(res_31237),
                                                  np.int32(computed_group_sizze_32491),
                                                  images_mem_37894,
                                                  arg_mem_37903, mem_37911,
                                                  mem_37939, mem_37948)
          cl.enqueue_nd_range_kernel(self.queue, self.map_intra_group_32493_var,
                                     ((np.long(sizze_31215) * np.long(computed_group_sizze_32491)),),
                                     (np.long(computed_group_sizze_32491),))
          if synchronous:
            self.queue.finish()
        mem_37939 = None
        res_mem_38005 = mem_37948
      else:
        # Group count shared by the map_32618 version below:
        # squot32(sizze_31215 * res_31237 + group_size - 1, group_size).
        num_groups_32974 = squot32(x_32973, group_sizze_32971)
        num_threads_32975 = (group_sizze_32971 * num_groups_32974)
        if suff_outer_par_32976:
          # Version: map_32618, followed by a map-transpose of its buffer.
          mem_37956 = opencl_alloc(self, bytes_37931, "mem_37956")
          binop_x_37950 = sext_i32_i64(res_31237)
          bytes_37949 = (np.int64(4) * binop_x_37950)
          # mem_37951 is sized as 4 * res_31237 bytes times the thread count.
          num_threads64_38485 = sext_i32_i64(num_threads_32975)
          total_sizze_38486 = (bytes_37949 * num_threads64_38485)
          mem_37951 = opencl_alloc(self, total_sizze_38486, "mem_37951")
          if ((1 * (np.long(num_groups_32974) * np.long(group_sizze_32971))) != 0):
            self.map_32618_var.set_args(np.int32(sizze_31215),
                                        np.int32(sizze_31216),
                                        np.int32(n_31219), np.int32(res_31237),
                                        images_mem_37894, mem_37907, mem_37911,
                                        mem_37951, mem_37956)
            cl.enqueue_nd_range_kernel(self.queue, self.map_32618_var,
                                       ((np.long(num_groups_32974) * np.long(group_sizze_32971)),),
                                       (np.long(group_sizze_32971),))
            if synchronous:
              self.queue.finish()
          mem_37951 = None
          mem_37961 = opencl_alloc(self, bytes_37931, "mem_37961")
          self.futhark__map_transpose_f32(mem_37961, np.int32(0), mem_37956,
                                          np.int32(0), np.int32(1),
                                          (sizze_31215 * res_31237), res_31237,
                                          ((sizze_31215 * res_31237) * res_31237),
                                          ((sizze_31215 * res_31237) * res_31237))
          mem_37956 = None
          res_mem_38004 = mem_37961
        else:
          if intra_suff_and_fits_32982:
            # Version: map_intra_group_32641.  Local size is res_31237 and the
            # global size is (sizze_31215 * res_31237) * res_31237, i.e.
            # sizze_31215 * res_31237 work-groups.
            mem_37969 = opencl_alloc(self, bytes_37931, "mem_37969")
            binop_x_37964 = sext_i32_i64(res_31237)
            bytes_37963 = (np.int64(4) * binop_x_37964)
            if ((1 * (np.long(binop_x_37932) * np.long(res_31237))) != 0):
              self.map_intra_group_32641_var.set_args(cl.LocalMemory(np.long(bytes_37963)),
                                                      np.int32(sizze_31215),
                                                      np.int32(sizze_31216),
                                                      np.int32(n_31219),
                                                      np.int32(res_31237),
                                                      images_mem_37894,
                                                      mem_37907, mem_37911,
                                                      mem_37969)
              cl.enqueue_nd_range_kernel(self.queue,
                                         self.map_intra_group_32641_var,
                                         ((np.long(binop_x_37932) * np.long(res_31237)),),
                                         (np.long(res_31237),))
              if synchronous:
                self.queue.finish()
            res_mem_38003 = mem_37969
          else:
            if suff_outer_par_33044:
              # Version: map_32725.  The group size is the square of
              # min(res_31237, configured tile size); the group count is
              # squot32(sizze_31215 + 29, 30) times the square of the number
              # of tiles needed to cover res_31237.
              tmp_37201 = (np.int32(29) + sizze_31215)
              gidzz_range_37200 = squot32(tmp_37201, np.int32(30))
              tile_sizze_37203 = self.sizes["main.tile_size_37202"]
              tile_sizze_x_37204 = smin32(res_31237, tile_sizze_37203)
              tiled_group_sizze_37206 = (tile_sizze_x_37204 * tile_sizze_x_37204)
              y_37213 = (tile_sizze_x_37204 - np.int32(1))
              x_37214 = (res_31237 + y_37213)
              groups_in_dim_37215 = squot32(x_37214, tile_sizze_x_37204)
              y_37220 = (groups_in_dim_37215 * groups_in_dim_37215)
              num_groups_37221 = (gidzz_range_37200 * y_37220)
              num_threads_37222 = (tiled_group_sizze_37206 * num_groups_37221)
              mem_37974 = opencl_alloc(self, bytes_37931, "mem_37974")
              mem_37978 = opencl_alloc(self, bytes_37912, "mem_37978")
              self.futhark__map_transpose_f32(mem_37978, np.int32(0),
                                              images_mem_37894, np.int32(0),
                                              np.int32(1), sizze_31216,
                                              sizze_31215,
                                              (sizze_31215 * sizze_31216),
                                              (sizze_31215 * sizze_31216))
              # Local-memory argument: 4 bytes per work-item in the group.
              binop_x_37980 = sext_i32_i64(tiled_group_sizze_37206)
              bytes_37979 = (np.int64(4) * binop_x_37980)
              if ((1 * (np.long(num_groups_37221) * np.long(tiled_group_sizze_37206))) != 0):
                self.map_32725_var.set_args(cl.LocalMemory(np.long(bytes_37979)),
                                            np.int32(sizze_31215),
                                            np.int32(n_31219),
                                            np.int32(res_31237),
                                            np.int32(gidzz_range_37200),
                                            np.int32(tile_sizze_x_37204),
                                            np.int32(tiled_group_sizze_37206),
                                            mem_37907, mem_37911, mem_37974,
                                            mem_37978)
                cl.enqueue_nd_range_kernel(self.queue, self.map_32725_var,
                                           ((np.long(num_groups_37221) * np.long(tiled_group_sizze_37206)),),
                                           (np.long(tiled_group_sizze_37206),))
                if synchronous:
                  self.queue.finish()
              mem_37978 = None
              res_mem_38002 = mem_37974
            else:
              if intra_suff_and_fits_33051:
                # Version: map_intra_group_32738.  Local size is n_31219 and
                # the global size is convop_x_37928 * n_31219.
                mem_37985 = opencl_alloc(self, bytes_37895, "mem_37985")
                self.futhark__map_transpose_f32(mem_37985, np.int32(0),
                                                mem_37911, np.int32(0),
                                                np.int32(1), res_31237,
                                                sizze_31214,
                                                (sizze_31214 * res_31237),
                                                (sizze_31214 * res_31237))
                mem_37991 = opencl_alloc(self, bytes_37926, "mem_37991")
                binop_x_37987 = sext_i32_i64(n_31219)
                bytes_37986 = (np.int64(4) * binop_x_37987)
                if ((1 * (np.long(convop_x_37928) * np.long(n_31219))) != 0):
                  self.map_intra_group_32738_var.set_args(cl.LocalMemory(np.long(bytes_37986)),
                                                          np.int32(sizze_31214),
                                                          np.int32(sizze_31215),
                                                          np.int32(sizze_31216),
                                                          np.int32(n_31219),
                                                          np.int32(res_31237),
                                                          images_mem_37894,
                                                          arg_mem_37903,
                                                          mem_37985, mem_37991)
                  cl.enqueue_nd_range_kernel(self.queue,
                                             self.map_intra_group_32738_var,
                                             ((np.long(convop_x_37928) * np.long(n_31219)),),
                                             (np.long(n_31219),))
                  if synchronous:
                    self.queue.finish()
                mem_37985 = None
                res_mem_38001 = mem_37991
              else:
                # Fallback version: one of the two segred_*_32822 kernels.
                # The group count is computed in 64-bit arithmetic as
                # max(1, min(max_num_groups, squot64(total + gs - 1, gs)))
                # and then converted back to 32 bits.
                total_num_elements_33108 = sext_i32_i64(num_threads_33047)
                group_sizze_33109 = self.sizes["main.group_size_32804"]
                max_num_groups_33110 = self.sizes["main.max_num_groups_32806"]
                group_sizze_33111 = sext_i32_i64(group_sizze_33109)
                max_num_groups_33112 = sext_i32_i64(max_num_groups_33110)
                y_33113 = (group_sizze_33111 - np.int64(1))
                x_33114 = (total_num_elements_33108 + y_33113)
                w_div_group_sizze_33115 = squot64(x_33114, group_sizze_33111)
                num_groups_maybe_zzero_33116 = smin64(max_num_groups_33112,
                                                      w_div_group_sizze_33115)
                num_groups_33117 = smax64(np.int64(1),
                                          num_groups_maybe_zzero_33116)
                num_threads_33118 = (group_sizze_33111 * num_groups_33117)
                num_groups_33119 = sext_i64_i32(num_groups_33117)
                num_threads_33120 = sext_i64_i32(num_threads_33118)
                mem_37995 = opencl_alloc(self, bytes_37895, "mem_37995")
                self.futhark__map_transpose_f32(mem_37995, np.int32(0),
                                                mem_37911, np.int32(0),
                                                np.int32(1), res_31237,
                                                sizze_31214,
                                                (sizze_31214 * res_31237),
                                                (sizze_31214 * res_31237))
                mem_38000 = opencl_alloc(self, bytes_37931, "mem_38000")
                # segred_small is used when 2 * n_31219 is below the group
                # size; otherwise segred_large is used.
                if slt32((n_31219 * np.int32(2)), group_sizze_33109):
                  # Segment size passed to the kernel is clamped to >= 1.
                  segment_sizze_nonzzero_38654 = smax32(np.int32(1), n_31219)
                  if ((1 * (np.long(num_groups_33119) * np.long(group_sizze_33109))) != 0):
                    self.segred_small_32822_var.set_args(np.int32(sizze_31214),
                                                         np.int32(sizze_31215),
                                                         np.int32(sizze_31216),
                                                         np.int32(n_31219),
                                                         np.int32(res_31237),
                                                         np.int32(num_groups_33119),
                                                         images_mem_37894,
                                                         arg_mem_37903,
                                                         mem_37995, mem_38000,
                                                         np.int32(segment_sizze_nonzzero_38654))
                    cl.enqueue_nd_range_kernel(self.queue,
                                               self.segred_small_32822_var,
                                               ((np.long(num_groups_33119) * np.long(group_sizze_33109)),),
                                               (np.long(group_sizze_33109),))
                    if synchronous:
                      self.queue.finish()
                else:
                  # With S = sizze_31215 * res_31237 * res_31237:
                  # groups per S-unit = squot32(num_groups_33119 + max(1, S) - 1,
                  # max(1, S)); the launch uses that many groups times S, and
                  # thread_per_segment is that many groups times the group size.
                  num_groups_38665 = (squot32(((num_groups_33119 + smax32(np.int32(1),
                                                                          ((sizze_31215 * res_31237) * res_31237))) - np.int32(1)),
                                              smax32(np.int32(1),
                                                     ((sizze_31215 * res_31237) * res_31237))) * ((sizze_31215 * res_31237) * res_31237))
                  num_threads_38666 = (num_groups_38665 * group_sizze_33109)
                  thread_per_segment_38667 = (squot32(((num_groups_33119 + smax32(np.int32(1),
                                                                                  ((sizze_31215 * res_31237) * res_31237))) - np.int32(1)),
                                                      smax32(np.int32(1),
                                                             ((sizze_31215 * res_31237) * res_31237))) * group_sizze_33109)
                  # Extra buffer of 4 bytes per launched group, plus a counter
                  # buffer held on self (created outside this excerpt).
                  group_res_arr_mem_38668 = opencl_alloc(self,
                                                         (np.int32(4) * num_groups_38665),
                                                         "group_res_arr_mem_38668")
                  counter_mem_38670 = self.counter_mem_38670
                  if ((1 * (np.long(num_groups_38665) * np.long(group_sizze_33109))) != 0):
                    self.segred_large_32822_var.set_args(np.int32(sizze_31214),
                                                         np.int32(sizze_31215),
                                                         np.int32(sizze_31216),
                                                         np.int32(n_31219),
                                                         np.int32(res_31237),
                                                         np.int32(num_groups_33119),
                                                         images_mem_37894,
                                                         arg_mem_37903,
                                                         mem_37995, mem_38000,
                                                         np.int32(thread_per_segment_38667),
                                                         group_res_arr_mem_38668,
                                                         counter_mem_38670)
                    cl.enqueue_nd_range_kernel(self.queue,
                                               self.segred_large_32822_var,
                                               ((np.long(num_groups_38665) * np.long(group_sizze_33109)),),
                                               (np.long(group_sizze_33109),))
                    if synchronous:
                      self.queue.finish()
                mem_37995 = None
                res_mem_38001 = mem_38000
              # Each enclosing else forwards the inner result one level up.
              res_mem_38002 = res_mem_38001
            res_mem_38003 = res_mem_38002
          res_mem_38004 = res_mem_38003
        res_mem_38005 = res_mem_38004
      res_mem_38006 = res_mem_38005
    # j_31369 = 2 * res_31237, so j_m_i_31370 (j - res_31237) is the length of
    # the range res_31237:j_31369 named in the error message below.
    j_31369 = (np.int32(2) * res_31237)
    j_m_i_31370 = (j_31369 - res_31237)
    res_31373 = (res_31237 * j_31369)
    # The slice is accepted when it is empty or when its last index,
    # res_31237 + (j_m_i_31370 - 1), satisfies sle32(0, last)
    # (sle32 is defined outside this excerpt; presumably signed <=).
    empty_slice_31386 = (j_m_i_31370 == np.int32(0))
    m_31387 = (j_m_i_31370 - np.int32(1))
    i_p_m_t_s_31388 = (res_31237 + m_31387)
    zzero_leq_i_p_m_t_s_31389 = sle32(np.int32(0), i_p_m_t_s_31388)
    ok_or_empty_31396 = (empty_slice_31386 or zzero_leq_i_p_m_t_s_31389)
    # NOTE: the certificate is set to True unconditionally; the assert does
    # the enforcement, and Python skips asserts under -O.
    index_certs_31398 = True
    assert ok_or_empty_31396, ("Error at bfastfinaldetailed.fut:112:1-240:86 -> bfastfinaldetailed.fut:147:14-29 -> bfastfinaldetailed.fut:88:8-37: %s%d%s%d%s%d%s%d%s%d%s%d%s" % ("Index [",
                                                                                                                                                                                   np.int32(0),
                                                                                                                                                                                   ":",
                                                                                                                                                                                   res_31237,
                                                                                                                                                                                   ", ",
                                                                                                                                                                                   res_31237,
                                                                                                                                                                                   ":",
                                                                                                                                                                                   j_31369,
                                                                                                                                                                                   "] out of bounds for array of shape [",
                                                                                                                                                                                   res_31237,
                                                                                                                                                                                   "][",
                                                                                                                                                                                   j_31369,
                                                                                                                                                                                   "]."))
    # Sizes and the version-selection flag for the stage that follows.
    # The intra-group version is chosen when res_31373 both reaches the
    # "main.suff_intra_par_13" threshold and passes the sle32 comparison
    # against max_group_sizze_32576.
    num_threads_33148 = (sizze_31215 * res_31373)
    fits_33310 = sle32(res_31373, max_group_sizze_32576)
    suff_intra_par_33308 = (self.sizes["main.suff_intra_par_13"] <= res_31373)
    intra_suff_and_fits_33311 = (suff_intra_par_33308 and fits_33310)
    # Byte counts (4 bytes per element) for the two candidate output buffers:
    # bytes_38015 for j_m_i_31370 * binop_x_37932 elements and
    # bytes_38020 for sizze_31215 * res_31373 elements.
    convop_x_38017 = (j_m_i_31370 * binop_x_37932)
    binop_x_38018 = sext_i32_i64(convop_x_38017)
    bytes_38015 = (np.int64(4) * binop_x_38018)
    binop_x_38022 = sext_i32_i64(num_threads_33148)
    bytes_38020 = (np.int64(4) * binop_x_38022)
    if intra_suff_and_fits_33311:
      mem_38019 = opencl_alloc(self, bytes_38015, "mem_38019")
      binop_x_38008 = sext_i32_i64(res_31373)
      bytes_38007 = (np.int64(4) * binop_x_38008)
      if ((1 * (np.long(sizze_31215) * np.long(res_31373))) != 0):
        self.map_intra_group_33149_var.set_args(cl.LocalMemory(np.long(bytes_38007)),
                                                cl.LocalMemory(np.long(bytes_38007)),
                                                np.int32(sizze_31215),
                                                np.int32(res_31237),
                                                np.int32(m_31319),
                                                np.int32(j_31369),
                                                np.int32(j_m_i_31370),
                                                np.int32(res_31373),
                                                res_mem_38006, mem_38019)
        cl.enqueue_nd_range_kernel(self.queue, self.map_intra_group_33149_var,
                                   ((np.long(sizze_31215) * np.long(res_31373)),),
                                   (np.long(res_31373),))
        if synchronous:
          self.queue.finish()
      res_mem_38037 = mem_38019
    else:
      group_sizze_33738 = self.sizes["main.group_size_33694"]
      y_33739 = (group_sizze_33738 - np.int32(1))
      x_33740 = (num_threads_33148 + y_33739)
      num_groups_33741 = squot32(x_33740, group_sizze_33738)
      num_threads_33742 = (group_sizze_33738 * num_groups_33741)
      mem_38023 = opencl_alloc(self, bytes_38020, "mem_38023")
      if ((1 * (np.long(num_groups_33741) * np.long(group_sizze_33738))) != 0):
        self.map_33700_var.set_args(np.int32(sizze_31215), np.int32(res_31237),
                                    np.int32(j_31369), np.int32(res_31373),
                                    res_mem_38006, mem_38023)
        cl.enqueue_nd_range_kernel(self.queue, self.map_33700_var,
                                   ((np.long(num_groups_33741) * np.long(group_sizze_33738)),),
                                   (np.long(group_sizze_33738),))
        if synchronous:
          self.queue.finish()
      loop_nonempty_36884 = slt32(np.int32(0), res_31237)
      group_sizze_33759 = self.sizes["main.group_size_33597"]
      y_33760 = (group_sizze_33759 - np.int32(1))
      x_33761 = (sizze_31215 + y_33760)
      if loop_nonempty_36884:
        x_36885 = squot32(x_33761, group_sizze_33759)
        num_groups_33762 = x_36885
      else:
        num_groups_33762 = np.int32(0)
      num_threads_33763 = (group_sizze_33759 * num_groups_33762)
      group_sizze_33777 = self.sizes["main.group_size_33536"]
      y_33778 = (group_sizze_33777 - np.int32(1))
      x_33779 = (num_threads_33148 + y_33778)
      if loop_nonempty_36884:
        x_36887 = squot32(x_33779, group_sizze_33777)
        num_groups_33780 = x_36887
      else:
        num_groups_33780 = np.int32(0)
      num_threads_33781 = (group_sizze_33777 * num_groups_33780)
      group_sizze_33807 = self.sizes["main.group_size_33486"]
      y_33808 = (group_sizze_33807 - np.int32(1))
      x_33809 = (num_threads_33148 + y_33808)
      if loop_nonempty_36884:
        x_36889 = squot32(x_33809, group_sizze_33807)
        num_groups_33810 = x_36889
      else:
        num_groups_33810 = np.int32(0)
      num_threads_33811 = (group_sizze_33807 * num_groups_33810)
      bytes_38025 = sext_i32_i64(sizze_31215)
      mem_38026 = opencl_alloc(self, bytes_38025, "mem_38026")
      mem_38030 = opencl_alloc(self, bytes_38020, "mem_38030")
      i_33757 = np.int32(0)
      one_39305 = np.int32(1)
      for counter_39304 in range(res_31237):
        if ((1 * (np.long(num_groups_33762) * np.long(group_sizze_33759))) != 0):
          self.map_33603_var.set_args(np.int32(sizze_31215),
                                      np.int32(res_31373), np.int32(i_33757),
                                      mem_38023, mem_38026)
          cl.enqueue_nd_range_kernel(self.queue, self.map_33603_var,
                                     ((np.long(num_groups_33762) * np.long(group_sizze_33759)),),
                                     (np.long(group_sizze_33759),))
          if synchronous:
            self.queue.finish()
        if ((1 * (np.long(num_groups_33780) * np.long(group_sizze_33777))) != 0):
          self.map_33542_var.set_args(np.int32(sizze_31215), np.int32(m_31319),
                                      np.int32(j_31369), np.int32(res_31373),
                                      np.int32(i_33757), mem_38023, mem_38026,
                                      mem_38030)
          cl.enqueue_nd_range_kernel(self.queue, self.map_33542_var,
                                     ((np.long(num_groups_33780) * np.long(group_sizze_33777)),),
                                     (np.long(group_sizze_33777),))
          if synchronous:
            self.queue.finish()
        if ((1 * (np.long(num_groups_33810) * np.long(group_sizze_33807))) != 0):
          self.map_33492_var.set_args(np.int32(sizze_31215),
                                      np.int32(res_31373), mem_38023, mem_38030)
          cl.enqueue_nd_range_kernel(self.queue, self.map_33492_var,
                                     ((np.long(num_groups_33810) * np.long(group_sizze_33807)),),
                                     (np.long(group_sizze_33807),))
          if synchronous:
            self.queue.finish()
        i_33757 += one_39305
      mem_38026 = None
      mem_38030 = None
      mem_38036 = opencl_alloc(self, bytes_38015, "mem_38036")
      group_sizze_38709 = self.sizes["main.group_size_38709"]
      num_groups_38710 = squot32((((sizze_31215 * (res_31237 * j_m_i_31370)) + sext_i32_i32(group_sizze_38709)) - np.int32(1)),
                                 sext_i32_i32(group_sizze_38709))
      if ((1 * (np.long(num_groups_38710) * np.long(group_sizze_38709))) != 0):
        self.copy_38706_var.set_args(np.int32(sizze_31215), np.int32(res_31237),
                                     np.int32(j_31369), np.int32(j_m_i_31370),
                                     mem_38023, mem_38036)
        cl.enqueue_nd_range_kernel(self.queue, self.copy_38706_var,
                                   ((np.long(num_groups_38710) * np.long(group_sizze_38709)),),
                                   (np.long(group_sizze_38709),))
        if synchronous:
          self.queue.finish()
      mem_38023 = None
      res_mem_38037 = mem_38036
    res_mem_38006 = None
    suff_outer_par_33828 = (self.sizes["main.suff_outer_par_17"] <= sizze_31215)
    suff_intra_par_33906 = (self.sizes["main.suff_intra_par_18"] <= res_31237)
    intra_suff_and_fits_33909 = (fits_32980 and suff_intra_par_33906)
    binop_x_38047 = sext_i32_i64(binop_x_37932)
    bytes_38045 = (np.int64(4) * binop_x_38047)
    suff_outer_par_34082 = (self.sizes["main.suff_outer_par_19"] <= binop_x_37932)
    num_threads_34084 = (n_31219 * binop_x_37932)
    suff_intra_par_34087 = (self.sizes["main.suff_intra_par_20"] <= n_31219)
    intra_suff_and_fits_34088 = (fits_33049 and suff_intra_par_34087)
    if suff_outer_par_33828:
      group_sizze_33882 = self.sizes["main.group_size_33855"]
      y_33883 = (group_sizze_33882 - np.int32(1))
      x_33884 = (sizze_31215 + y_33883)
      num_groups_33885 = squot32(x_33884, group_sizze_33882)
      num_threads_33886 = (group_sizze_33882 * num_groups_33885)
      mem_38041 = opencl_alloc(self, bytes_37912, "mem_38041")
      self.futhark__map_transpose_f32(mem_38041, np.int32(0), images_mem_37894,
                                      np.int32(0), np.int32(1), sizze_31216,
                                      sizze_31215, (sizze_31215 * sizze_31216),
                                      (sizze_31215 * sizze_31216))
      mem_38048 = opencl_alloc(self, bytes_38045, "mem_38048")
      binop_x_38043 = sext_i32_i64(res_31237)
      bytes_38042 = (np.int64(4) * binop_x_38043)
      num_threads64_38496 = sext_i32_i64(num_threads_33886)
      total_sizze_38497 = (bytes_38042 * num_threads64_38496)
      mem_38044 = opencl_alloc(self, total_sizze_38497, "mem_38044")
      if ((1 * (np.long(num_groups_33885) * np.long(group_sizze_33882))) != 0):
        self.map_33861_var.set_args(np.int32(sizze_31214),
                                    np.int32(sizze_31215), np.int32(n_31219),
                                    np.int32(res_31237), arg_mem_37903,
                                    mem_38041, mem_38044, mem_38048)
        cl.enqueue_nd_range_kernel(self.queue, self.map_33861_var,
                                   ((np.long(num_groups_33885) * np.long(group_sizze_33882)),),
                                   (np.long(group_sizze_33882),))
        if synchronous:
          self.queue.finish()
      mem_38041 = None
      mem_38044 = None
      mem_38052 = opencl_alloc(self, bytes_38045, "mem_38052")
      self.futhark__map_transpose_f32(mem_38052, np.int32(0), mem_38048,
                                      np.int32(0), np.int32(1), sizze_31215,
                                      res_31237, (sizze_31215 * res_31237),
                                      (sizze_31215 * res_31237))
      mem_38048 = None
      res_mem_38086 = mem_38052
    else:
      if intra_suff_and_fits_33909:
        mem_38060 = opencl_alloc(self, bytes_38045, "mem_38060")
        binop_x_38055 = sext_i32_i64(res_31237)
        bytes_38054 = (np.int64(4) * binop_x_38055)
        if ((1 * (np.long(sizze_31215) * np.long(res_31237))) != 0):
          self.map_intra_group_33844_var.set_args(cl.LocalMemory(np.long(bytes_38054)),
                                                  np.int32(sizze_31215),
                                                  np.int32(sizze_31216),
                                                  np.int32(n_31219),
                                                  np.int32(res_31237),
                                                  images_mem_37894, mem_37907,
                                                  mem_38060)
          cl.enqueue_nd_range_kernel(self.queue, self.map_intra_group_33844_var,
                                     ((np.long(sizze_31215) * np.long(res_31237)),),
                                     (np.long(res_31237),))
          if synchronous:
            self.queue.finish()
        res_mem_38085 = mem_38060
      else:
        if suff_outer_par_34082:
          tile_sizze_37710 = self.sizes["main.tile_size_37709"]
          tiled_group_sizze_37711 = (tile_sizze_37710 * tile_sizze_37710)
          y_37714 = (tile_sizze_37710 - np.int32(1))
          x_37715 = (sizze_31215 + y_37714)
          groups_in_dim_37716 = squot32(x_37715, tile_sizze_37710)
          x_37718 = (res_31237 + y_37714)
          groups_in_dim_37719 = squot32(x_37718, tile_sizze_37710)
          num_groups_37721 = (groups_in_dim_37716 * groups_in_dim_37719)
          num_threads_37722 = (tiled_group_sizze_37711 * num_groups_37721)
          mem_38072 = opencl_alloc(self, bytes_38045, "mem_38072")
          binop_x_38063 = sext_i32_i64(tiled_group_sizze_37711)
          bytes_38061 = (np.int64(4) * binop_x_38063)
          if ((1 * (np.long(num_groups_37721) * np.long(tiled_group_sizze_37711))) != 0):
            self.map_33955_var.set_args(np.int32(sizze_31215),
                                        np.int32(sizze_31216),
                                        np.int32(n_31219), np.int32(res_31237),
                                        images_mem_37894, mem_37907, mem_38072)
            cl.enqueue_nd_range_kernel(self.queue, self.map_33955_var,
                                       ((np.long(num_groups_37721) * np.long(tiled_group_sizze_37711)),),
                                       (np.long(tiled_group_sizze_37711),))
            if synchronous:
              self.queue.finish()
          res_mem_38084 = mem_38072
        else:
          if intra_suff_and_fits_34088:
            mem_38078 = opencl_alloc(self, bytes_38045, "mem_38078")
            binop_x_38074 = sext_i32_i64(n_31219)
            bytes_38073 = (np.int64(4) * binop_x_38074)
            if ((1 * (np.long(binop_x_37932) * np.long(n_31219))) != 0):
              self.map_intra_group_33966_var.set_args(cl.LocalMemory(np.long(bytes_38073)),
                                                      np.int32(sizze_31214),
                                                      np.int32(sizze_31215),
                                                      np.int32(sizze_31216),
                                                      np.int32(n_31219),
                                                      np.int32(res_31237),
                                                      images_mem_37894,
                                                      arg_mem_37903, mem_38078)
              cl.enqueue_nd_range_kernel(self.queue,
                                         self.map_intra_group_33966_var,
                                         ((np.long(binop_x_37932) * np.long(n_31219)),),
                                         (np.long(n_31219),))
              if synchronous:
                self.queue.finish()
            res_mem_38083 = mem_38078
          else:
            total_num_elements_34134 = sext_i32_i64(num_threads_34084)
            group_sizze_34135 = self.sizes["main.group_size_34020"]
            max_num_groups_34136 = self.sizes["main.max_num_groups_34022"]
            group_sizze_34137 = sext_i32_i64(group_sizze_34135)
            max_num_groups_34138 = sext_i32_i64(max_num_groups_34136)
            y_34139 = (group_sizze_34137 - np.int64(1))
            x_34140 = (total_num_elements_34134 + y_34139)
            w_div_group_sizze_34141 = squot64(x_34140, group_sizze_34137)
            num_groups_maybe_zzero_34142 = smin64(max_num_groups_34138,
                                                  w_div_group_sizze_34141)
            num_groups_34143 = smax64(np.int64(1), num_groups_maybe_zzero_34142)
            num_threads_34144 = (group_sizze_34137 * num_groups_34143)
            num_groups_34145 = sext_i64_i32(num_groups_34143)
            num_threads_34146 = sext_i64_i32(num_threads_34144)
            mem_38082 = opencl_alloc(self, bytes_38045, "mem_38082")
            if slt32((n_31219 * np.int32(2)), group_sizze_34135):
              segment_sizze_nonzzero_38739 = smax32(np.int32(1), n_31219)
              if ((1 * (np.long(num_groups_34145) * np.long(group_sizze_34135))) != 0):
                self.segred_small_34038_var.set_args(np.int32(sizze_31214),
                                                     np.int32(sizze_31215),
                                                     np.int32(sizze_31216),
                                                     np.int32(n_31219),
                                                     np.int32(res_31237),
                                                     np.int32(num_groups_34145),
                                                     images_mem_37894,
                                                     arg_mem_37903, mem_38082,
                                                     np.int32(segment_sizze_nonzzero_38739))
                cl.enqueue_nd_range_kernel(self.queue,
                                           self.segred_small_34038_var,
                                           ((np.long(num_groups_34145) * np.long(group_sizze_34135)),),
                                           (np.long(group_sizze_34135),))
                if synchronous:
                  self.queue.finish()
            else:
              num_groups_38750 = (squot32(((num_groups_34145 + smax32(np.int32(1),
                                                                      (sizze_31215 * res_31237))) - np.int32(1)),
                                          smax32(np.int32(1),
                                                 (sizze_31215 * res_31237))) * (sizze_31215 * res_31237))
              num_threads_38751 = (num_groups_38750 * group_sizze_34135)
              thread_per_segment_38752 = (squot32(((num_groups_34145 + smax32(np.int32(1),
                                                                              (sizze_31215 * res_31237))) - np.int32(1)),
                                                  smax32(np.int32(1),
                                                         (sizze_31215 * res_31237))) * group_sizze_34135)
              group_res_arr_mem_38753 = opencl_alloc(self,
                                                     (np.int32(4) * num_groups_38750),
                                                     "group_res_arr_mem_38753")
              counter_mem_38755 = self.counter_mem_38755
              if ((1 * (np.long(num_groups_38750) * np.long(group_sizze_34135))) != 0):
                self.segred_large_34038_var.set_args(np.int32(sizze_31214),
                                                     np.int32(sizze_31215),
                                                     np.int32(sizze_31216),
                                                     np.int32(n_31219),
                                                     np.int32(res_31237),
                                                     np.int32(num_groups_34145),
                                                     images_mem_37894,
                                                     arg_mem_37903, mem_38082,
                                                     np.int32(thread_per_segment_38752),
                                                     group_res_arr_mem_38753,
                                                     counter_mem_38755)
                cl.enqueue_nd_range_kernel(self.queue,
                                           self.segred_large_34038_var,
                                           ((np.long(num_groups_38750) * np.long(group_sizze_34135)),),
                                           (np.long(group_sizze_34135),))
                if synchronous:
                  self.queue.finish()
            res_mem_38083 = mem_38082
          res_mem_38084 = res_mem_38083
        res_mem_38085 = res_mem_38084
      res_mem_38086 = res_mem_38085
    arg_mem_37903 = None
    mem_37907 = None
    suff_outer_par_34164 = (self.sizes["main.suff_outer_par_21"] <= sizze_31215)
    suff_intra_par_34242 = (self.sizes["main.suff_intra_par_22"] <= res_31237)
    intra_suff_and_fits_34245 = (fits_32980 and suff_intra_par_34242)
    binop_x_38092 = (res_31237 * j_m_i_31370)
    convop_x_38093 = (sizze_31215 * binop_x_38092)
    binop_x_38094 = sext_i32_i64(convop_x_38093)
    bytes_38091 = (np.int64(4) * binop_x_38094)
    binop_x_38109 = (sizze_31215 * j_m_i_31370)
    convop_x_38110 = (res_31237 * binop_x_38109)
    binop_x_38111 = sext_i32_i64(convop_x_38110)
    bytes_38108 = (np.int64(4) * binop_x_38111)
    group_sizze_34408 = self.sizes["main.group_size_34286"]
    y_34409 = (group_sizze_34408 - np.int32(1))
    x_34410 = (y_34409 + binop_x_37932)
    suff_outer_par_34413 = (self.sizes["main.suff_outer_par_23"] <= binop_x_37932)
    fits_34417 = sle32(j_m_i_31370, max_group_sizze_32576)
    suff_intra_par_34418 = (self.sizes["main.suff_intra_par_24"] <= j_m_i_31370)
    intra_suff_and_fits_34419 = (fits_34417 and suff_intra_par_34418)
    if suff_outer_par_34164:
      group_sizze_34218 = self.sizes["main.group_size_34191"]
      y_34219 = (group_sizze_34218 - np.int32(1))
      x_34220 = (sizze_31215 + y_34219)
      num_groups_34221 = squot32(x_34220, group_sizze_34218)
      num_threads_34222 = (group_sizze_34218 * num_groups_34221)
      mem_38090 = opencl_alloc(self, bytes_38045, "mem_38090")
      self.futhark__map_transpose_f32(mem_38090, np.int32(0), res_mem_38086,
                                      np.int32(0), np.int32(1), res_31237,
                                      sizze_31215, (sizze_31215 * res_31237),
                                      (sizze_31215 * res_31237))
      mem_38095 = opencl_alloc(self, bytes_38091, "mem_38095")
      group_sizze_38775 = self.sizes["main.group_size_38775"]
      num_groups_38776 = squot32((((sizze_31215 * (res_31237 * j_m_i_31370)) + sext_i32_i32(group_sizze_38775)) - np.int32(1)),
                                 sext_i32_i32(group_sizze_38775))
      if ((1 * (np.long(num_groups_38776) * np.long(group_sizze_38775))) != 0):
        self.copy_38772_var.set_args(np.int32(sizze_31215), np.int32(res_31237),
                                     np.int32(j_m_i_31370), res_mem_38037,
                                     mem_38095)
        cl.enqueue_nd_range_kernel(self.queue, self.copy_38772_var,
                                   ((np.long(num_groups_38776) * np.long(group_sizze_38775)),),
                                   (np.long(group_sizze_38775),))
        if synchronous:
          self.queue.finish()
      mem_38102 = opencl_alloc(self, bytes_38045, "mem_38102")
      binop_x_38097 = sext_i32_i64(res_31237)
      bytes_38096 = (np.int64(4) * binop_x_38097)
      num_threads64_38502 = sext_i32_i64(num_threads_34222)
      total_sizze_38503 = (bytes_38096 * num_threads64_38502)
      mem_38098 = opencl_alloc(self, total_sizze_38503, "mem_38098")
      if ((1 * (np.long(num_groups_34221) * np.long(group_sizze_34218))) != 0):
        self.map_34197_var.set_args(np.int32(sizze_31215), np.int32(res_31237),
                                    np.int32(j_m_i_31370), mem_38090, mem_38095,
                                    mem_38098, mem_38102)
        cl.enqueue_nd_range_kernel(self.queue, self.map_34197_var,
                                   ((np.long(num_groups_34221) * np.long(group_sizze_34218)),),
                                   (np.long(group_sizze_34218),))
        if synchronous:
          self.queue.finish()
      mem_38090 = None
      mem_38095 = None
      mem_38098 = None
      mem_38106 = opencl_alloc(self, bytes_38045, "mem_38106")
      self.futhark__map_transpose_f32(mem_38106, np.int32(0), mem_38102,
                                      np.int32(0), np.int32(1), sizze_31215,
                                      res_31237, (sizze_31215 * res_31237),
                                      (sizze_31215 * res_31237))
      mem_38102 = None
      res_mem_38142 = mem_38106
    else:
      if intra_suff_and_fits_34245:
        mem_38112 = opencl_alloc(self, bytes_38108, "mem_38112")
        self.futhark__map_transpose_f32(mem_38112, np.int32(0), res_mem_38037,
                                        np.int32(0), np.int32(1), j_m_i_31370,
                                        (sizze_31215 * res_31237),
                                        ((sizze_31215 * res_31237) * j_m_i_31370),
                                        ((sizze_31215 * res_31237) * j_m_i_31370))
        mem_38119 = opencl_alloc(self, bytes_38045, "mem_38119")
        binop_x_38114 = sext_i32_i64(res_31237)
        bytes_38113 = (np.int64(4) * binop_x_38114)
        if ((1 * (np.long(sizze_31215) * np.long(res_31237))) != 0):
          self.map_intra_group_34180_var.set_args(cl.LocalMemory(np.long(bytes_38113)),
                                                  np.int32(sizze_31215),
                                                  np.int32(res_31237),
                                                  np.int32(j_m_i_31370),
                                                  res_mem_38086, mem_38112,
                                                  mem_38119)
          cl.enqueue_nd_range_kernel(self.queue, self.map_intra_group_34180_var,
                                     ((np.long(sizze_31215) * np.long(res_31237)),),
                                     (np.long(res_31237),))
          if synchronous:
            self.queue.finish()
        mem_38112 = None
        res_mem_38141 = mem_38119
      else:
        num_groups_34411 = squot32(x_34410, group_sizze_34408)
        num_threads_34412 = (group_sizze_34408 * num_groups_34411)
        if suff_outer_par_34413:
          mem_38124 = opencl_alloc(self, bytes_38108, "mem_38124")
          self.futhark__map_transpose_f32(mem_38124, np.int32(0), res_mem_38037,
                                          np.int32(0), np.int32(1), j_m_i_31370,
                                          (sizze_31215 * res_31237),
                                          ((sizze_31215 * res_31237) * j_m_i_31370),
                                          ((sizze_31215 * res_31237) * j_m_i_31370))
          mem_38128 = opencl_alloc(self, bytes_38045, "mem_38128")
          if ((1 * (np.long(num_groups_34411) * np.long(group_sizze_34408))) != 0):
            self.map_34292_var.set_args(np.int32(sizze_31215),
                                        np.int32(res_31237),
                                        np.int32(j_m_i_31370), res_mem_38086,
                                        mem_38124, mem_38128)
            cl.enqueue_nd_range_kernel(self.queue, self.map_34292_var,
                                       ((np.long(num_groups_34411) * np.long(group_sizze_34408)),),
                                       (np.long(group_sizze_34408),))
            if synchronous:
              self.queue.finish()
          mem_38124 = None
          res_mem_38140 = mem_38128
        else:
          if intra_suff_and_fits_34419:
            mem_38134 = opencl_alloc(self, bytes_38045, "mem_38134")
            binop_x_38130 = sext_i32_i64(j_m_i_31370)
            bytes_38129 = (np.int64(4) * binop_x_38130)
            if ((1 * (np.long(binop_x_37932) * np.long(j_m_i_31370))) != 0):
              self.map_intra_group_34303_var.set_args(cl.LocalMemory(np.long(bytes_38129)),
                                                      np.int32(sizze_31215),
                                                      np.int32(res_31237),
                                                      np.int32(j_m_i_31370),
                                                      res_mem_38037,
                                                      res_mem_38086, mem_38134)
              cl.enqueue_nd_range_kernel(self.queue,
                                         self.map_intra_group_34303_var,
                                         ((np.long(binop_x_37932) * np.long(j_m_i_31370)),),
                                         (np.long(j_m_i_31370),))
              if synchronous:
                self.queue.finish()
            res_mem_38139 = mem_38134
          else:
            group_sizze_34461 = self.sizes["main.group_size_34352"]
            max_num_groups_34462 = self.sizes["main.max_num_groups_34354"]
            group_sizze_34463 = sext_i32_i64(group_sizze_34461)
            max_num_groups_34464 = sext_i32_i64(max_num_groups_34462)
            y_34465 = (group_sizze_34463 - np.int64(1))
            x_34466 = (y_34465 + binop_x_38018)
            w_div_group_sizze_34467 = squot64(x_34466, group_sizze_34463)
            num_groups_maybe_zzero_34468 = smin64(max_num_groups_34464,
                                                  w_div_group_sizze_34467)
            num_groups_34469 = smax64(np.int64(1), num_groups_maybe_zzero_34468)
            num_threads_34470 = (group_sizze_34463 * num_groups_34469)
            num_groups_34471 = sext_i64_i32(num_groups_34469)
            num_threads_34472 = sext_i64_i32(num_threads_34470)
            mem_38138 = opencl_alloc(self, bytes_38045, "mem_38138")
            if slt32((j_m_i_31370 * np.int32(2)), group_sizze_34461):
              segment_sizze_nonzzero_38800 = smax32(np.int32(1), j_m_i_31370)
              if ((1 * (np.long(num_groups_34471) * np.long(group_sizze_34461))) != 0):
                self.segred_small_34370_var.set_args(np.int32(sizze_31215),
                                                     np.int32(res_31237),
                                                     np.int32(j_m_i_31370),
                                                     np.int32(num_groups_34471),
                                                     res_mem_38037,
                                                     res_mem_38086, mem_38138,
                                                     np.int32(segment_sizze_nonzzero_38800))
                cl.enqueue_nd_range_kernel(self.queue,
                                           self.segred_small_34370_var,
                                           ((np.long(num_groups_34471) * np.long(group_sizze_34461)),),
                                           (np.long(group_sizze_34461),))
                if synchronous:
                  self.queue.finish()
            else:
              num_groups_38811 = (squot32(((num_groups_34471 + smax32(np.int32(1),
                                                                      (sizze_31215 * res_31237))) - np.int32(1)),
                                          smax32(np.int32(1),
                                                 (sizze_31215 * res_31237))) * (sizze_31215 * res_31237))
              num_threads_38812 = (num_groups_38811 * group_sizze_34461)
              thread_per_segment_38813 = (squot32(((num_groups_34471 + smax32(np.int32(1),
                                                                              (sizze_31215 * res_31237))) - np.int32(1)),
                                                  smax32(np.int32(1),
                                                         (sizze_31215 * res_31237))) * group_sizze_34461)
              group_res_arr_mem_38814 = opencl_alloc(self,
                                                     (np.int32(4) * num_groups_38811),
                                                     "group_res_arr_mem_38814")
              counter_mem_38816 = self.counter_mem_38816
              if ((1 * (np.long(num_groups_38811) * np.long(group_sizze_34461))) != 0):
                self.segred_large_34370_var.set_args(np.int32(sizze_31215),
                                                     np.int32(res_31237),
                                                     np.int32(j_m_i_31370),
                                                     np.int32(num_groups_34471),
                                                     res_mem_38037,
                                                     res_mem_38086, mem_38138,
                                                     np.int32(thread_per_segment_38813),
                                                     group_res_arr_mem_38814,
                                                     counter_mem_38816)
                cl.enqueue_nd_range_kernel(self.queue,
                                           self.segred_large_34370_var,
                                           ((np.long(num_groups_38811) * np.long(group_sizze_34461)),),
                                           (np.long(group_sizze_34461),))
                if synchronous:
                  self.queue.finish()
            res_mem_38139 = mem_38138
          res_mem_38140 = res_mem_38139
        res_mem_38141 = res_mem_38140
      res_mem_38142 = res_mem_38141
    res_mem_38037 = None
    res_mem_38086 = None
    suff_outer_par_34489 = (self.sizes["main.suff_outer_par_25"] <= sizze_31215)
    num_threads_34504 = (sizze_31214 * sizze_31215)
    fits_34565 = sle32(sizze_31214, max_group_sizze_32576)
    suff_intra_par_34563 = (self.sizes["main.suff_intra_par_26"] <= sizze_31214)
    intra_suff_and_fits_34566 = (suff_intra_par_34563 and fits_34565)
    binop_x_38152 = sext_i32_i64(num_threads_34504)
    bytes_38150 = (np.int64(4) * binop_x_38152)
    suff_outer_par_34731 = (self.sizes["main.suff_outer_par_27"] <= num_threads_34504)
    num_threads_34733 = (res_31237 * num_threads_34504)
    suff_intra_par_34736 = (self.sizes["main.suff_intra_par_28"] <= res_31237)
    intra_suff_and_fits_34737 = (fits_32980 and suff_intra_par_34736)
    if suff_outer_par_34489:
      group_sizze_34541 = self.sizes["main.group_size_34516"]
      y_34542 = (group_sizze_34541 - np.int32(1))
      x_34543 = (sizze_31215 + y_34542)
      num_groups_34544 = squot32(x_34543, group_sizze_34541)
      num_threads_34545 = (group_sizze_34541 * num_groups_34544)
      mem_38146 = opencl_alloc(self, bytes_38045, "mem_38146")
      self.futhark__map_transpose_f32(mem_38146, np.int32(0), res_mem_38142,
                                      np.int32(0), np.int32(1), res_31237,
                                      sizze_31215, (sizze_31215 * res_31237),
                                      (sizze_31215 * res_31237))
      mem_38153 = opencl_alloc(self, bytes_38150, "mem_38153")
      binop_x_38148 = sext_i32_i64(sizze_31214)
      bytes_38147 = (np.int64(4) * binop_x_38148)
      num_threads64_38508 = sext_i32_i64(num_threads_34545)
      total_sizze_38509 = (bytes_38147 * num_threads64_38508)
      mem_38149 = opencl_alloc(self, total_sizze_38509, "mem_38149")
      if ((1 * (np.long(num_groups_34544) * np.long(group_sizze_34541))) != 0):
        self.map_34522_var.set_args(np.int32(sizze_31214),
                                    np.int32(sizze_31215), np.int32(res_31237),
                                    mem_37911, mem_38146, mem_38149, mem_38153)
        cl.enqueue_nd_range_kernel(self.queue, self.map_34522_var,
                                   ((np.long(num_groups_34544) * np.long(group_sizze_34541)),),
                                   (np.long(group_sizze_34541),))
        if synchronous:
          self.queue.finish()
      mem_38146 = None
      mem_38149 = None
      mem_38157 = opencl_alloc(self, bytes_38150, "mem_38157")
      self.futhark__map_transpose_f32(mem_38157, np.int32(0), mem_38153,
                                      np.int32(0), np.int32(1), sizze_31215,
                                      sizze_31214, (sizze_31215 * sizze_31214),
                                      (sizze_31215 * sizze_31214))
      mem_38153 = None
      res_mem_38199 = mem_38157
    else:
      if intra_suff_and_fits_34566:
        mem_38162 = opencl_alloc(self, bytes_37895, "mem_38162")
        self.futhark__map_transpose_f32(mem_38162, np.int32(0), mem_37911,
                                        np.int32(0), np.int32(1), res_31237,
                                        sizze_31214, (sizze_31214 * res_31237),
                                        (sizze_31214 * res_31237))
        mem_38169 = opencl_alloc(self, bytes_38150, "mem_38169")
        binop_x_38164 = sext_i32_i64(sizze_31214)
        bytes_38163 = (np.int64(4) * binop_x_38164)
        if ((1 * (np.long(sizze_31215) * np.long(sizze_31214))) != 0):
          self.map_intra_group_34505_var.set_args(cl.LocalMemory(np.long(bytes_38163)),
                                                  np.int32(sizze_31214),
                                                  np.int32(sizze_31215),
                                                  np.int32(res_31237),
                                                  res_mem_38142, mem_38162,
                                                  mem_38169)
          cl.enqueue_nd_range_kernel(self.queue, self.map_intra_group_34505_var,
                                     ((np.long(sizze_31215) * np.long(sizze_31214)),),
                                     (np.long(sizze_31214),))
          if synchronous:
            self.queue.finish()
        mem_38162 = None
        res_mem_38198 = mem_38169
      else:
        if suff_outer_par_34731:
          mem_38173 = opencl_alloc(self, bytes_37895, "mem_38173")
          self.futhark__map_transpose_f32(mem_38173, np.int32(0), mem_37911,
                                          np.int32(0), np.int32(1), res_31237,
                                          sizze_31214,
                                          (sizze_31214 * res_31237),
                                          (sizze_31214 * res_31237))
          tile_sizze_37760 = self.sizes["main.tile_size_37759"]
          tiled_group_sizze_37761 = (tile_sizze_37760 * tile_sizze_37760)
          y_37764 = (tile_sizze_37760 - np.int32(1))
          x_37765 = (sizze_31215 + y_37764)
          groups_in_dim_37766 = squot32(x_37765, tile_sizze_37760)
          x_37768 = (sizze_31214 + y_37764)
          groups_in_dim_37769 = squot32(x_37768, tile_sizze_37760)
          num_groups_37771 = (groups_in_dim_37766 * groups_in_dim_37769)
          num_threads_37772 = (tiled_group_sizze_37761 * num_groups_37771)
          mem_38185 = opencl_alloc(self, bytes_38150, "mem_38185")
          binop_x_38176 = sext_i32_i64(tiled_group_sizze_37761)
          bytes_38174 = (np.int64(4) * binop_x_38176)
          if ((1 * (np.long(num_groups_37771) * np.long(tiled_group_sizze_37761))) != 0):
            self.map_34610_var.set_args(np.int32(sizze_31214),
                                        np.int32(sizze_31215),
                                        np.int32(res_31237), res_mem_38142,
                                        mem_38173, mem_38185)
            cl.enqueue_nd_range_kernel(self.queue, self.map_34610_var,
                                       ((np.long(num_groups_37771) * np.long(tiled_group_sizze_37761)),),
                                       (np.long(tiled_group_sizze_37761),))
            if synchronous:
              self.queue.finish()
          mem_38173 = None
          res_mem_38197 = mem_38185
        else:
          if intra_suff_and_fits_34737:
            mem_38191 = opencl_alloc(self, bytes_38150, "mem_38191")
            binop_x_38187 = sext_i32_i64(res_31237)
            bytes_38186 = (np.int64(4) * binop_x_38187)
            if ((1 * (np.long(num_threads_34504) * np.long(res_31237))) != 0):
              self.map_intra_group_34621_var.set_args(cl.LocalMemory(np.long(bytes_38186)),
                                                      np.int32(sizze_31214),
                                                      np.int32(sizze_31215),
                                                      np.int32(res_31237),
                                                      mem_37911, res_mem_38142,
                                                      mem_38191)
              cl.enqueue_nd_range_kernel(self.queue,
                                         self.map_intra_group_34621_var,
                                         ((np.long(num_threads_34504) * np.long(res_31237)),),
                                         (np.long(res_31237),))
              if synchronous:
                self.queue.finish()
            res_mem_38196 = mem_38191
          else:
            total_num_elements_34779 = sext_i32_i64(num_threads_34733)
            group_sizze_34780 = self.sizes["main.group_size_34671"]
            max_num_groups_34781 = self.sizes["main.max_num_groups_34673"]
            group_sizze_34782 = sext_i32_i64(group_sizze_34780)
            max_num_groups_34783 = sext_i32_i64(max_num_groups_34781)
            y_34784 = (group_sizze_34782 - np.int64(1))
            x_34785 = (total_num_elements_34779 + y_34784)
            w_div_group_sizze_34786 = squot64(x_34785, group_sizze_34782)
            num_groups_maybe_zzero_34787 = smin64(max_num_groups_34783,
                                                  w_div_group_sizze_34786)
            num_groups_34788 = smax64(np.int64(1), num_groups_maybe_zzero_34787)
            num_threads_34789 = (group_sizze_34782 * num_groups_34788)
            num_groups_34790 = sext_i64_i32(num_groups_34788)
            num_threads_34791 = sext_i64_i32(num_threads_34789)
            mem_38195 = opencl_alloc(self, bytes_38150, "mem_38195")
            if slt32((res_31237 * np.int32(2)), group_sizze_34780):
              segment_sizze_nonzzero_38861 = smax32(np.int32(1), res_31237)
              if ((1 * (np.long(num_groups_34790) * np.long(group_sizze_34780))) != 0):
                self.segred_small_34689_var.set_args(np.int32(sizze_31214),
                                                     np.int32(sizze_31215),
                                                     np.int32(res_31237),
                                                     np.int32(num_groups_34790),
                                                     mem_37911, res_mem_38142,
                                                     mem_38195,
                                                     np.int32(segment_sizze_nonzzero_38861))
                cl.enqueue_nd_range_kernel(self.queue,
                                           self.segred_small_34689_var,
                                           ((np.long(num_groups_34790) * np.long(group_sizze_34780)),),
                                           (np.long(group_sizze_34780),))
                if synchronous:
                  self.queue.finish()
            else:
              num_groups_38872 = (squot32(((num_groups_34790 + smax32(np.int32(1),
                                                                      (sizze_31215 * sizze_31214))) - np.int32(1)),
                                          smax32(np.int32(1),
                                                 (sizze_31215 * sizze_31214))) * (sizze_31215 * sizze_31214))
              num_threads_38873 = (num_groups_38872 * group_sizze_34780)
              thread_per_segment_38874 = (squot32(((num_groups_34790 + smax32(np.int32(1),
                                                                              (sizze_31215 * sizze_31214))) - np.int32(1)),
                                                  smax32(np.int32(1),
                                                         (sizze_31215 * sizze_31214))) * group_sizze_34780)
              group_res_arr_mem_38875 = opencl_alloc(self,
                                                     (np.int32(4) * num_groups_38872),
                                                     "group_res_arr_mem_38875")
              counter_mem_38877 = self.counter_mem_38877
              if ((1 * (np.long(num_groups_38872) * np.long(group_sizze_34780))) != 0):
                self.segred_large_34689_var.set_args(np.int32(sizze_31214),
                                                     np.int32(sizze_31215),
                                                     np.int32(res_31237),
                                                     np.int32(num_groups_34790),
                                                     mem_37911, res_mem_38142,
                                                     mem_38195,
                                                     np.int32(thread_per_segment_38874),
                                                     group_res_arr_mem_38875,
                                                     counter_mem_38877)
                cl.enqueue_nd_range_kernel(self.queue,
                                           self.segred_large_34689_var,
                                           ((np.long(num_groups_38872) * np.long(group_sizze_34780)),),
                                           (np.long(group_sizze_34780),))
                if synchronous:
                  self.queue.finish()
            res_mem_38196 = mem_38195
          res_mem_38197 = res_mem_38196
        res_mem_38198 = res_mem_38197
      res_mem_38199 = res_mem_38198
    mem_37911 = None
    res_mem_38142 = None
    i_31490 = (sizze_31214 - np.int32(1))
    x_31491 = sle32(np.int32(0), i_31490)
    index_certs_31494 = True
    assert x_31491, ("Error at bfastfinaldetailed.fut:112:1-240:86 -> bfastfinaldetailed.fut:167:5-174:25 -> /futlib/soacs.fut:51:3-37 -> /futlib/soacs.fut:51:19-23 -> bfastfinaldetailed.fut:171:30-91 -> bfastfinaldetailed.fut:28:13-20 -> /futlib/array.fut:18:29-34: %s%d%s%d%s" % ("Index [",
                                                                                                                                                                                                                                                                                          i_31490,
                                                                                                                                                                                                                                                                                          "] out of bounds for array of shape [",
                                                                                                                                                                                                                                                                                          sizze_31214,
                                                                                                                                                                                                                                                                                          "]."))
    suff_outer_par_34807 = (self.sizes["main.suff_outer_par_29"] <= sizze_31215)
    suff_intra_par_34979 = (self.sizes["main.suff_intra_par_30"] <= sizze_31214)
    intra_suff_and_fits_34982 = (fits_34565 and suff_intra_par_34979)
    binop_x_38224 = sext_i32_i64(sizze_31215)
    bytes_38223 = (np.int64(4) * binop_x_38224)
    if suff_outer_par_34807:
      group_sizze_34916 = self.sizes["main.group_size_34852"]
      y_34917 = (group_sizze_34916 - np.int32(1))
      x_34918 = (sizze_31215 + y_34917)
      num_groups_34919 = squot32(x_34918, group_sizze_34916)
      num_threads_34920 = (group_sizze_34916 * num_groups_34919)
      mem_38203 = opencl_alloc(self, bytes_37912, "mem_38203")
      self.futhark__map_transpose_f32(mem_38203, np.int32(0), images_mem_37894,
                                      np.int32(0), np.int32(1), sizze_31216,
                                      sizze_31215, (sizze_31215 * sizze_31216),
                                      (sizze_31215 * sizze_31216))
      mem_38207 = opencl_alloc(self, bytes_38150, "mem_38207")
      self.futhark__map_transpose_f32(mem_38207, np.int32(0), res_mem_38199,
                                      np.int32(0), np.int32(1), sizze_31214,
                                      sizze_31215, (sizze_31215 * sizze_31214),
                                      (sizze_31215 * sizze_31214))
      mem_38225 = opencl_alloc(self, bytes_38223, "mem_38225")
      mem_38229 = opencl_alloc(self, bytes_38150, "mem_38229")
      mem_38233 = opencl_alloc(self, bytes_38150, "mem_38233")
      binop_x_38209 = sext_i32_i64(sizze_31214)
      bytes_38208 = (np.int64(4) * binop_x_38209)
      num_threads64_38514 = sext_i32_i64(num_threads_34920)
      total_sizze_38515 = (bytes_38208 * num_threads64_38514)
      mem_38210 = opencl_alloc(self, total_sizze_38515, "mem_38210")
      total_sizze_38516 = (bytes_38208 * num_threads64_38514)
      mem_38213 = opencl_alloc(self, total_sizze_38516, "mem_38213")
      total_sizze_38517 = (bytes_38208 * num_threads64_38514)
      mem_38216 = opencl_alloc(self, total_sizze_38517, "mem_38216")
      total_sizze_38518 = (bytes_38208 * num_threads64_38514)
      mem_38219 = opencl_alloc(self, total_sizze_38518, "mem_38219")
      if ((1 * (np.long(num_groups_34919) * np.long(group_sizze_34916))) != 0):
        self.map_34858_var.set_args(np.int32(sizze_31214),
                                    np.int32(sizze_31215), np.int32(i_31490),
                                    mem_38203, mem_38207, mem_38210, mem_38213,
                                    mem_38216, mem_38219, mem_38225, mem_38229,
                                    mem_38233)
        cl.enqueue_nd_range_kernel(self.queue, self.map_34858_var,
                                   ((np.long(num_groups_34919) * np.long(group_sizze_34916)),),
                                   (np.long(group_sizze_34916),))
        if synchronous:
          self.queue.finish()
      mem_38203 = None
      mem_38207 = None
      mem_38210 = None
      mem_38213 = None
      mem_38216 = None
      mem_38219 = None
      mem_38237 = opencl_alloc(self, bytes_38150, "mem_38237")
      self.futhark__map_transpose_f32(mem_38237, np.int32(0), mem_38229,
                                      np.int32(0), np.int32(1), sizze_31215,
                                      sizze_31214, (sizze_31215 * sizze_31214),
                                      (sizze_31215 * sizze_31214))
      mem_38229 = None
      mem_38242 = opencl_alloc(self, bytes_38150, "mem_38242")
      self.futhark__map_transpose_i32(mem_38242, np.int32(0), mem_38233,
                                      np.int32(0), np.int32(1), sizze_31215,
                                      sizze_31214, (sizze_31215 * sizze_31214),
                                      (sizze_31215 * sizze_31214))
      mem_38233 = None
      res_mem_38289 = mem_38225
      res_mem_38290 = mem_38237
      res_mem_38291 = mem_38242
    else:
      if intra_suff_and_fits_34982:
        mem_38258 = opencl_alloc(self, bytes_38223, "mem_38258")
        mem_38262 = opencl_alloc(self, bytes_38150, "mem_38262")
        mem_38266 = opencl_alloc(self, bytes_38150, "mem_38266")
        binop_x_38245 = sext_i32_i64(sizze_31214)
        bytes_38244 = (np.int64(4) * binop_x_38245)
        if ((1 * (np.long(sizze_31215) * np.long(sizze_31214))) != 0):
          self.map_intra_group_34818_var.set_args(cl.LocalMemory(np.long(bytes_38244)),
                                                  cl.LocalMemory(np.long(bytes_38244)),
                                                  cl.LocalMemory(np.long(bytes_38244)),
                                                  cl.LocalMemory(np.long(bytes_38244)),
                                                  np.int32(sizze_31214),
                                                  np.int32(sizze_31215),
                                                  np.int32(sizze_31216),
                                                  np.int32(i_31490),
                                                  images_mem_37894,
                                                  res_mem_38199, mem_38258,
                                                  mem_38262, mem_38266)
          cl.enqueue_nd_range_kernel(self.queue, self.map_intra_group_34818_var,
                                     ((np.long(sizze_31215) * np.long(sizze_31214)),),
                                     (np.long(sizze_31214),))
          if synchronous:
            self.queue.finish()
        res_mem_38286 = mem_38258
        res_mem_38287 = mem_38262
        res_mem_38288 = mem_38266
      else:
        group_sizze_35152 = self.sizes["main.group_size_35127"]
        max_num_groups_35153 = self.sizes["main.max_num_groups_35129"]
        group_sizze_35154 = sext_i32_i64(group_sizze_35152)
        max_num_groups_35155 = sext_i32_i64(max_num_groups_35153)
        y_35156 = (group_sizze_35154 - np.int64(1))
        x_35157 = (y_35156 + binop_x_38152)
        w_div_group_sizze_35158 = squot64(x_35157, group_sizze_35154)
        num_groups_maybe_zzero_35159 = smin64(max_num_groups_35155,
                                              w_div_group_sizze_35158)
        num_groups_35160 = smax64(np.int64(1), num_groups_maybe_zzero_35159)
        num_threads_35161 = (group_sizze_35154 * num_groups_35160)
        num_groups_35162 = sext_i64_i32(num_groups_35160)
        num_threads_35163 = sext_i64_i32(num_threads_35161)
        mem_38270 = opencl_alloc(self, bytes_38150, "mem_38270")
        mem_38274 = opencl_alloc(self, bytes_38150, "mem_38274")
        if ((1 * (np.long(num_groups_35162) * np.long(group_sizze_35152))) != 0):
          self.scan_stage1_35145_var.set_args(np.int32(sizze_31214),
                                              np.int32(sizze_31215),
                                              np.int32(sizze_31216),
                                              np.int32(num_groups_35162),
                                              images_mem_37894, res_mem_38199,
                                              mem_38270, mem_38274)
          cl.enqueue_nd_range_kernel(self.queue, self.scan_stage1_35145_var,
                                     ((np.long(num_groups_35162) * np.long(group_sizze_35152)),),
                                     (np.long(group_sizze_35152),))
          if synchronous:
            self.queue.finish()
        if ((1 * (np.long(np.int32(1)) * np.long(num_groups_35162))) != 0):
          self.scan_stage2_38948_var.set_args(cl.LocalMemory(np.long((np.int32(4) * num_groups_35162))),
                                              np.int32(sizze_31214),
                                              np.int32(sizze_31215),
                                              np.int32(num_groups_35162),
                                              mem_38270)
          cl.enqueue_nd_range_kernel(self.queue, self.scan_stage2_38948_var,
                                     ((np.long(np.int32(1)) * np.long(num_groups_35162)),),
                                     (np.long(num_groups_35162),))
          if synchronous:
            self.queue.finish()
        group_sizze_38964 = self.sizes["main.group_size_38964"]
        num_groups_38965 = squot32((((sizze_31215 * sizze_31214) + sext_i32_i32(group_sizze_38964)) - np.int32(1)),
                                   sext_i32_i32(group_sizze_38964))
        if ((1 * (np.long(num_groups_38965) * np.long(group_sizze_38964))) != 0):
          self.scan_stage3_38961_var.set_args(np.int32(sizze_31214),
                                              np.int32(sizze_31215),
                                              np.int32(num_groups_35162),
                                              mem_38270)
          cl.enqueue_nd_range_kernel(self.queue, self.scan_stage3_38961_var,
                                     ((np.long(num_groups_38965) * np.long(group_sizze_38964)),),
                                     (np.long(group_sizze_38964),))
          if synchronous:
            self.queue.finish()
        mem_38277 = opencl_alloc(self, bytes_38223, "mem_38277")
        group_sizze_38971 = self.sizes["main.group_size_38971"]
        num_groups_38972 = squot32(((sizze_31215 + sext_i32_i32(group_sizze_38971)) - np.int32(1)),
                                   sext_i32_i32(group_sizze_38971))
        if ((1 * (np.long(num_groups_38972) * np.long(group_sizze_38971))) != 0):
          self.copy_38968_var.set_args(np.int32(sizze_31214),
                                       np.int32(sizze_31215), np.int32(i_31490),
                                       mem_38270, mem_38277)
          cl.enqueue_nd_range_kernel(self.queue, self.copy_38968_var,
                                     ((np.long(num_groups_38972) * np.long(group_sizze_38971)),),
                                     (np.long(group_sizze_38971),))
          if synchronous:
            self.queue.finish()
        mem_38281 = opencl_alloc(self, bytes_38150, "mem_38281")
        group_sizze_38976 = self.sizes["main.group_size_38976"]
        num_groups_38977 = squot32((((sizze_31215 * sizze_31214) + sext_i32_i32(group_sizze_38976)) - np.int32(1)),
                                   sext_i32_i32(group_sizze_38976))
        if ((1 * (np.long(num_groups_38977) * np.long(group_sizze_38976))) != 0):
          self.replicate_38973_var.set_args(np.int32(sizze_31214),
                                            np.int32(sizze_31215), mem_38281)
          cl.enqueue_nd_range_kernel(self.queue, self.replicate_38973_var,
                                     ((np.long(num_groups_38977) * np.long(group_sizze_38976)),),
                                     (np.long(group_sizze_38976),))
          if synchronous:
            self.queue.finish()
        mem_38285 = opencl_alloc(self, bytes_38150, "mem_38285")
        group_sizze_38981 = self.sizes["main.group_size_38981"]
        num_groups_38982 = squot32((((sizze_31215 * sizze_31214) + sext_i32_i32(group_sizze_38981)) - np.int32(1)),
                                   sext_i32_i32(group_sizze_38981))
        if ((1 * (np.long(num_groups_38982) * np.long(group_sizze_38981))) != 0):
          self.replicate_38978_var.set_args(np.int32(sizze_31214),
                                            np.int32(sizze_31215), mem_38285)
          cl.enqueue_nd_range_kernel(self.queue, self.replicate_38978_var,
                                     ((np.long(num_groups_38982) * np.long(group_sizze_38981)),),
                                     (np.long(group_sizze_38981),))
          if synchronous:
            self.queue.finish()
        group_sizze_35224 = self.sizes["main.group_size_35027"]
        y_35225 = (group_sizze_35224 - np.int32(1))
        x_35226 = (num_threads_34504 + y_35225)
        num_groups_35227 = squot32(x_35226, group_sizze_35224)
        num_threads_35228 = (group_sizze_35224 * num_groups_35227)
        if ((1 * (np.long(num_groups_35227) * np.long(group_sizze_35224))) != 0):
          self.map_35033_var.set_args(np.int32(sizze_31214),
                                      np.int32(sizze_31215), mem_38270,
                                      mem_38274, mem_38281, mem_38285)
          cl.enqueue_nd_range_kernel(self.queue, self.map_35033_var,
                                     ((np.long(num_groups_35227) * np.long(group_sizze_35224)),),
                                     (np.long(group_sizze_35224),))
          if synchronous:
            self.queue.finish()
        mem_38270 = None
        mem_38274 = None
        res_mem_38286 = mem_38277
        res_mem_38287 = mem_38281
        res_mem_38288 = mem_38285
      res_mem_38289 = res_mem_38286
      res_mem_38290 = res_mem_38287
      res_mem_38291 = res_mem_38288
    suff_outer_par_35248 = (self.sizes["main.suff_outer_par_33"] <= sizze_31215)
    num_threads_35255 = (sizze_31215 * n_31219)
    suff_intra_par_35386 = (self.sizes["main.suff_intra_par_34"] <= n_31219)
    intra_suff_and_fits_35389 = (fits_33049 and suff_intra_par_35386)
    if suff_outer_par_35248:
      group_sizze_35334 = self.sizes["main.group_size_35281"]
      y_35335 = (group_sizze_35334 - np.int32(1))
      x_35336 = (sizze_31215 + y_35335)
      num_groups_35337 = squot32(x_35336, group_sizze_35334)
      num_threads_35338 = (group_sizze_35334 * num_groups_35337)
      mem_38295 = opencl_alloc(self, bytes_37912, "mem_38295")
      self.futhark__map_transpose_f32(mem_38295, np.int32(0), images_mem_37894,
                                      np.int32(0), np.int32(1), sizze_31216,
                                      sizze_31215, (sizze_31215 * sizze_31216),
                                      (sizze_31215 * sizze_31216))
      mem_38299 = opencl_alloc(self, bytes_38150, "mem_38299")
      self.futhark__map_transpose_f32(mem_38299, np.int32(0), res_mem_38290,
                                      np.int32(0), np.int32(1), sizze_31214,
                                      sizze_31215, (sizze_31215 * sizze_31214),
                                      (sizze_31215 * sizze_31214))
      mem_38302 = opencl_alloc(self, bytes_38223, "mem_38302")
      mem_38305 = opencl_alloc(self, bytes_38223, "mem_38305")
      mem_38308 = opencl_alloc(self, bytes_38223, "mem_38308")
      if ((1 * (np.long(num_groups_35337) * np.long(group_sizze_35334))) != 0):
        self.map_35287_var.set_args(np.int32(sizze_31215), np.int32(n_31219),
                                    np.float32(hfrac_31221),
                                    np.int32(res_31235), mem_38295, mem_38299,
                                    mem_38302, mem_38305, mem_38308)
        cl.enqueue_nd_range_kernel(self.queue, self.map_35287_var,
                                   ((np.long(num_groups_35337) * np.long(group_sizze_35334)),),
                                   (np.long(group_sizze_35334),))
        if synchronous:
          self.queue.finish()
      mem_38295 = None
      mem_38299 = None
      res_mem_38339 = mem_38302
      res_mem_38340 = mem_38305
      res_mem_38341 = mem_38308
    else:
      if intra_suff_and_fits_35389:
        mem_38317 = opencl_alloc(self, bytes_38223, "mem_38317")
        mem_38320 = opencl_alloc(self, bytes_38223, "mem_38320")
        mem_38323 = opencl_alloc(self, bytes_38223, "mem_38323")
        binop_x_38310 = sext_i32_i64(n_31219)
        bytes_38309 = (np.int64(4) * binop_x_38310)
        if ((1 * (np.long(sizze_31215) * np.long(n_31219))) != 0):
          self.map_intra_group_35256_var.set_args(cl.LocalMemory(np.long(bytes_38309)),
                                                  cl.LocalMemory(np.long(bytes_38309)),
                                                  np.int32(sizze_31214),
                                                  np.int32(sizze_31215),
                                                  np.int32(sizze_31216),
                                                  np.int32(n_31219),
                                                  np.float32(hfrac_31221),
                                                  np.int32(res_31235),
                                                  images_mem_37894,
                                                  res_mem_38290, mem_38317,
                                                  mem_38320, mem_38323)
          cl.enqueue_nd_range_kernel(self.queue, self.map_intra_group_35256_var,
                                     ((np.long(sizze_31215) * np.long(n_31219)),),
                                     (np.long(n_31219),))
          if synchronous:
            self.queue.finish()
        res_mem_38336 = mem_38317
        res_mem_38337 = mem_38320
        res_mem_38338 = mem_38323
      else:
        total_num_elements_35497 = sext_i32_i64(num_threads_35255)
        group_sizze_35498 = self.sizes["main.group_size_35474"]
        max_num_groups_35499 = self.sizes["main.max_num_groups_35476"]
        group_sizze_35500 = sext_i32_i64(group_sizze_35498)
        max_num_groups_35501 = sext_i32_i64(max_num_groups_35499)
        y_35502 = (group_sizze_35500 - np.int64(1))
        x_35503 = (total_num_elements_35497 + y_35502)
        w_div_group_sizze_35504 = squot64(x_35503, group_sizze_35500)
        num_groups_maybe_zzero_35505 = smin64(max_num_groups_35501,
                                              w_div_group_sizze_35504)
        num_groups_35506 = smax64(np.int64(1), num_groups_maybe_zzero_35505)
        num_threads_35507 = (group_sizze_35500 * num_groups_35506)
        num_groups_35508 = sext_i64_i32(num_groups_35506)
        num_threads_35509 = sext_i64_i32(num_threads_35507)
        mem_38326 = opencl_alloc(self, bytes_38223, "mem_38326")
        if slt32((n_31219 * np.int32(2)), group_sizze_35498):
          segment_sizze_nonzzero_39004 = smax32(np.int32(1), n_31219)
          if ((1 * (np.long(num_groups_35508) * np.long(group_sizze_35498))) != 0):
            self.segred_small_35492_var.set_args(np.int32(sizze_31215),
                                                 np.int32(sizze_31216),
                                                 np.int32(n_31219),
                                                 np.int32(num_groups_35508),
                                                 images_mem_37894, mem_38326,
                                                 np.int32(segment_sizze_nonzzero_39004))
            cl.enqueue_nd_range_kernel(self.queue, self.segred_small_35492_var,
                                       ((np.long(num_groups_35508) * np.long(group_sizze_35498)),),
                                       (np.long(group_sizze_35498),))
            if synchronous:
              self.queue.finish()
        else:
          num_groups_39015 = (squot32(((num_groups_35508 + smax32(np.int32(1),
                                                                  sizze_31215)) - np.int32(1)),
                                      smax32(np.int32(1),
                                             sizze_31215)) * sizze_31215)
          num_threads_39016 = (num_groups_39015 * group_sizze_35498)
          thread_per_segment_39017 = (squot32(((num_groups_35508 + smax32(np.int32(1),
                                                                          sizze_31215)) - np.int32(1)),
                                              smax32(np.int32(1),
                                                     sizze_31215)) * group_sizze_35498)
          group_res_arr_mem_39018 = opencl_alloc(self,
                                                 (np.int32(4) * num_groups_39015),
                                                 "group_res_arr_mem_39018")
          counter_mem_39020 = self.counter_mem_39020
          if ((1 * (np.long(num_groups_39015) * np.long(group_sizze_35498))) != 0):
            self.segred_large_35492_var.set_args(np.int32(sizze_31215),
                                                 np.int32(sizze_31216),
                                                 np.int32(n_31219),
                                                 np.int32(num_groups_35508),
                                                 images_mem_37894, mem_38326,
                                                 np.int32(thread_per_segment_39017),
                                                 group_res_arr_mem_39018,
                                                 counter_mem_39020)
            cl.enqueue_nd_range_kernel(self.queue, self.segred_large_35492_var,
                                       ((np.long(num_groups_39015) * np.long(group_sizze_35498)),),
                                       (np.long(group_sizze_35498),))
            if synchronous:
              self.queue.finish()
        group_sizze_35526 = self.sizes["main.group_size_35449"]
        max_num_groups_35527 = self.sizes["main.max_num_groups_35451"]
        group_sizze_35528 = sext_i32_i64(group_sizze_35526)
        max_num_groups_35529 = sext_i32_i64(max_num_groups_35527)
        y_35530 = (group_sizze_35528 - np.int64(1))
        x_35531 = (total_num_elements_35497 + y_35530)
        w_div_group_sizze_35532 = squot64(x_35531, group_sizze_35528)
        num_groups_maybe_zzero_35533 = smin64(max_num_groups_35529,
                                              w_div_group_sizze_35532)
        num_groups_35534 = smax64(np.int64(1), num_groups_maybe_zzero_35533)
        num_threads_35535 = (group_sizze_35528 * num_groups_35534)
        num_groups_35536 = sext_i64_i32(num_groups_35534)
        num_threads_35537 = sext_i64_i32(num_threads_35535)
        mem_38329 = opencl_alloc(self, bytes_38223, "mem_38329")
        if slt32((n_31219 * np.int32(2)), group_sizze_35526):
          segment_sizze_nonzzero_39039 = smax32(np.int32(1), n_31219)
          if ((1 * (np.long(num_groups_35536) * np.long(group_sizze_35526))) != 0):
            self.segred_small_35467_var.set_args(np.int32(sizze_31214),
                                                 np.int32(sizze_31215),
                                                 np.int32(n_31219),
                                                 np.int32(num_groups_35536),
                                                 res_mem_38290, mem_38326,
                                                 mem_38329,
                                                 np.int32(segment_sizze_nonzzero_39039))
            cl.enqueue_nd_range_kernel(self.queue, self.segred_small_35467_var,
                                       ((np.long(num_groups_35536) * np.long(group_sizze_35526)),),
                                       (np.long(group_sizze_35526),))
            if synchronous:
              self.queue.finish()
        else:
          num_groups_39050 = (squot32(((num_groups_35536 + smax32(np.int32(1),
                                                                  sizze_31215)) - np.int32(1)),
                                      smax32(np.int32(1),
                                             sizze_31215)) * sizze_31215)
          num_threads_39051 = (num_groups_39050 * group_sizze_35526)
          thread_per_segment_39052 = (squot32(((num_groups_35536 + smax32(np.int32(1),
                                                                          sizze_31215)) - np.int32(1)),
                                              smax32(np.int32(1),
                                                     sizze_31215)) * group_sizze_35526)
          group_res_arr_mem_39053 = opencl_alloc(self,
                                                 (np.int32(4) * num_groups_39050),
                                                 "group_res_arr_mem_39053")
          counter_mem_39055 = self.counter_mem_39055
          if ((1 * (np.long(num_groups_39050) * np.long(group_sizze_35526))) != 0):
            self.segred_large_35467_var.set_args(np.int32(sizze_31214),
                                                 np.int32(sizze_31215),
                                                 np.int32(n_31219),
                                                 np.int32(num_groups_35536),
                                                 res_mem_38290, mem_38326,
                                                 mem_38329,
                                                 np.int32(thread_per_segment_39052),
                                                 group_res_arr_mem_39053,
                                                 counter_mem_39055)
            cl.enqueue_nd_range_kernel(self.queue, self.segred_large_35467_var,
                                       ((np.long(num_groups_39050) * np.long(group_sizze_35526)),),
                                       (np.long(group_sizze_35526),))
            if synchronous:
              self.queue.finish()
        group_sizze_35552 = self.sizes["main.group_size_35426"]
        y_35553 = (group_sizze_35552 - np.int32(1))
        x_35554 = (sizze_31215 + y_35553)
        num_groups_35555 = squot32(x_35554, group_sizze_35552)
        num_threads_35556 = (group_sizze_35552 * num_groups_35555)
        mem_38332 = opencl_alloc(self, bytes_38223, "mem_38332")
        mem_38335 = opencl_alloc(self, bytes_38223, "mem_38335")
        if ((1 * (np.long(num_groups_35555) * np.long(group_sizze_35552))) != 0):
          self.map_35432_var.set_args(np.int32(sizze_31215),
                                      np.float32(hfrac_31221),
                                      np.int32(res_31235), mem_38326, mem_38329,
                                      mem_38332, mem_38335)
          cl.enqueue_nd_range_kernel(self.queue, self.map_35432_var,
                                     ((np.long(num_groups_35555) * np.long(group_sizze_35552)),),
                                     (np.long(group_sizze_35552),))
          if synchronous:
            self.queue.finish()
        mem_38329 = None
        res_mem_38336 = mem_38332
        res_mem_38337 = mem_38326
        res_mem_38338 = mem_38335
      res_mem_38339 = res_mem_38336
      res_mem_38340 = res_mem_38337
      res_mem_38341 = res_mem_38338
    group_sizze_35578 = self.sizes["main.group_size_35577"]
    max_num_groups_35580 = self.sizes["main.max_num_groups_35579"]
    group_sizze_35581 = sext_i32_i64(group_sizze_35578)
    max_num_groups_35582 = sext_i32_i64(max_num_groups_35580)
    y_35583 = (group_sizze_35581 - np.int64(1))
    x_35584 = (y_35583 + binop_x_38224)
    w_div_group_sizze_35585 = squot64(x_35584, group_sizze_35581)
    num_groups_maybe_zzero_35586 = smin64(max_num_groups_35582,
                                          w_div_group_sizze_35585)
    num_groups_35587 = smax64(np.int64(1), num_groups_maybe_zzero_35586)
    num_threads_35588 = (group_sizze_35581 * num_groups_35587)
    num_groups_35589 = sext_i64_i32(num_groups_35587)
    num_threads_35590 = sext_i64_i32(num_threads_35588)
    mem_38344 = opencl_alloc(self, np.int64(4), "mem_38344")
    counter_mem_39076 = self.counter_mem_39076
    group_res_arr_mem_39078 = opencl_alloc(self,
                                           (np.int32(4) * num_groups_35589),
                                           "group_res_arr_mem_39078")
    num_threads_39080 = (group_sizze_35578 * num_groups_35589)
    if ((1 * (np.long(num_groups_35589) * np.long(group_sizze_35578))) != 0):
      self.segred_nonseg_35595_var.set_args(np.int32(sizze_31215),
                                            np.int32(num_groups_35589),
                                            res_mem_38339, mem_38344,
                                            counter_mem_39076,
                                            group_res_arr_mem_39078,
                                            np.int32(num_threads_39080))
      cl.enqueue_nd_range_kernel(self.queue, self.segred_nonseg_35595_var,
                                 ((np.long(num_groups_35589) * np.long(group_sizze_35578)),),
                                 (np.long(group_sizze_35578),))
      if synchronous:
        self.queue.finish()
    read_res_39312 = np.empty(1, dtype=ct.c_int32)
    cl.enqueue_copy(self.queue, read_res_39312, mem_38344,
                    device_offset=np.long(np.int32(0)), is_blocking=True)
    res_31594 = read_res_39312[0]
    mem_38344 = None
    suff_outer_par_35598 = (self.sizes["main.suff_outer_par_35"] <= sizze_31215)
    num_threads_35603 = (sizze_31215 * res_31594)
    fits_35681 = sle32(res_31594, max_group_sizze_32576)
    suff_intra_par_35679 = (self.sizes["main.suff_intra_par_36"] <= res_31594)
    intra_suff_and_fits_35682 = (suff_intra_par_35679 and fits_35681)
    if suff_outer_par_35598:
      group_sizze_35650 = self.sizes["main.group_size_35618"]
      y_35651 = (group_sizze_35650 - np.int32(1))
      x_35652 = (sizze_31215 + y_35651)
      num_groups_35653 = squot32(x_35652, group_sizze_35650)
      num_threads_35654 = (group_sizze_35650 * num_groups_35653)
      mem_38347 = opencl_alloc(self, bytes_38223, "mem_38347")
      if ((1 * (np.long(num_groups_35653) * np.long(group_sizze_35650))) != 0):
        self.map_35624_var.set_args(np.int32(sizze_31214),
                                    np.int32(sizze_31215), np.int32(res_31594),
                                    res_mem_38290, res_mem_38339, res_mem_38340,
                                    mem_38347)
        cl.enqueue_nd_range_kernel(self.queue, self.map_35624_var,
                                   ((np.long(num_groups_35653) * np.long(group_sizze_35650)),),
                                   (np.long(group_sizze_35650),))
        if synchronous:
          self.queue.finish()
      res_mem_38358 = mem_38347
    else:
      if intra_suff_and_fits_35682:
        mem_38353 = opencl_alloc(self, bytes_38223, "mem_38353")
        binop_x_38349 = sext_i32_i64(res_31594)
        bytes_38348 = (np.int64(4) * binop_x_38349)
        if ((1 * (np.long(sizze_31215) * np.long(res_31594))) != 0):
          self.map_intra_group_35604_var.set_args(cl.LocalMemory(np.long(bytes_38348)),
                                                  np.int32(sizze_31214),
                                                  np.int32(sizze_31215),
                                                  np.int32(res_31594),
                                                  res_mem_38290, res_mem_38339,
                                                  res_mem_38340, mem_38353)
          cl.enqueue_nd_range_kernel(self.queue, self.map_intra_group_35604_var,
                                     ((np.long(sizze_31215) * np.long(res_31594)),),
                                     (np.long(res_31594),))
          if synchronous:
            self.queue.finish()
        res_mem_38357 = mem_38353
      else:
        total_num_elements_35729 = sext_i32_i64(num_threads_35603)
        group_sizze_35730 = self.sizes["main.group_size_35705"]
        max_num_groups_35731 = self.sizes["main.max_num_groups_35707"]
        group_sizze_35732 = sext_i32_i64(group_sizze_35730)
        max_num_groups_35733 = sext_i32_i64(max_num_groups_35731)
        y_35734 = (group_sizze_35732 - np.int64(1))
        x_35735 = (total_num_elements_35729 + y_35734)
        w_div_group_sizze_35736 = squot64(x_35735, group_sizze_35732)
        num_groups_maybe_zzero_35737 = smin64(max_num_groups_35733,
                                              w_div_group_sizze_35736)
        num_groups_35738 = smax64(np.int64(1), num_groups_maybe_zzero_35737)
        num_threads_35739 = (group_sizze_35732 * num_groups_35738)
        num_groups_35740 = sext_i64_i32(num_groups_35738)
        num_threads_35741 = sext_i64_i32(num_threads_35739)
        mem_38356 = opencl_alloc(self, bytes_38223, "mem_38356")
        if slt32((res_31594 * np.int32(2)), group_sizze_35730):
          segment_sizze_nonzzero_39108 = smax32(np.int32(1), res_31594)
          if ((1 * (np.long(num_groups_35740) * np.long(group_sizze_35730))) != 0):
            self.segred_small_35723_var.set_args(np.int32(sizze_31214),
                                                 np.int32(sizze_31215),
                                                 np.int32(res_31594),
                                                 np.int32(num_groups_35740),
                                                 res_mem_38290, res_mem_38339,
                                                 res_mem_38340, mem_38356,
                                                 np.int32(segment_sizze_nonzzero_39108))
            cl.enqueue_nd_range_kernel(self.queue, self.segred_small_35723_var,
                                       ((np.long(num_groups_35740) * np.long(group_sizze_35730)),),
                                       (np.long(group_sizze_35730),))
            if synchronous:
              self.queue.finish()
        else:
          num_groups_39119 = (squot32(((num_groups_35740 + smax32(np.int32(1),
                                                                  sizze_31215)) - np.int32(1)),
                                      smax32(np.int32(1),
                                             sizze_31215)) * sizze_31215)
          num_threads_39120 = (num_groups_39119 * group_sizze_35730)
          thread_per_segment_39121 = (squot32(((num_groups_35740 + smax32(np.int32(1),
                                                                          sizze_31215)) - np.int32(1)),
                                              smax32(np.int32(1),
                                                     sizze_31215)) * group_sizze_35730)
          group_res_arr_mem_39122 = opencl_alloc(self,
                                                 (np.int32(4) * num_groups_39119),
                                                 "group_res_arr_mem_39122")
          counter_mem_39124 = self.counter_mem_39124
          if ((1 * (np.long(num_groups_39119) * np.long(group_sizze_35730))) != 0):
            self.segred_large_35723_var.set_args(np.int32(sizze_31214),
                                                 np.int32(sizze_31215),
                                                 np.int32(res_31594),
                                                 np.int32(num_groups_35740),
                                                 res_mem_38290, res_mem_38339,
                                                 res_mem_38340, mem_38356,
                                                 np.int32(thread_per_segment_39121),
                                                 group_res_arr_mem_39122,
                                                 counter_mem_39124)
            cl.enqueue_nd_range_kernel(self.queue, self.segred_large_35723_var,
                                       ((np.long(num_groups_39119) * np.long(group_sizze_35730)),),
                                       (np.long(group_sizze_35730),))
            if synchronous:
              self.queue.finish()
        res_mem_38357 = mem_38356
      res_mem_38358 = res_mem_38357
    arg_31616 = (sizze_31214 - n_31219)
    bounds_invalid_upwards_31617 = slt32(arg_31616, np.int32(0))
    eq_x_zz_31618 = (np.int32(0) == arg_31616)
    not_p_31619 = not(bounds_invalid_upwards_31617)
    p_and_eq_x_y_31620 = (eq_x_zz_31618 and not_p_31619)
    dim_zzero_31621 = (bounds_invalid_upwards_31617 or p_and_eq_x_y_31620)
    both_empty_31622 = (eq_x_zz_31618 and dim_zzero_31621)
    eq_x_y_31623 = (arg_31616 == np.int32(0))
    p_and_eq_x_y_31624 = (bounds_invalid_upwards_31617 and eq_x_y_31623)
    dim_match_31625 = (not_p_31619 or p_and_eq_x_y_31624)
    empty_or_match_31626 = (both_empty_31622 or dim_match_31625)
    empty_or_match_cert_31627 = True
    assert empty_or_match_31626, ("Error at bfastfinaldetailed.fut:112:1-240:86 -> bfastfinaldetailed.fut:204:22-31 -> /futlib/array.fut:61:1-62:12: %s%s%s%d%s%s" % ("Function return value does not match shape of type ",
                                                                                                                                                                      "*",
                                                                                                                                                                      "[",
                                                                                                                                                                      arg_31616,
                                                                                                                                                                      "]",
                                                                                                                                                                      "intrinsics.i32"))
    x_31629 = (np.int32(1) + n_31219)
    index_certs_31630 = True
    assert x_31491, ("Error at bfastfinaldetailed.fut:112:1-240:86 -> bfastfinaldetailed.fut:200:15-204:32 -> bfastfinaldetailed.fut:202:63-81: %s%d%s%d%s" % ("Index [",
                                                                                                                                                               i_31490,
                                                                                                                                                               "] out of bounds for array of shape [",
                                                                                                                                                               sizze_31214,
                                                                                                                                                               "]."))
    read_res_39314 = np.empty(1, dtype=ct.c_int32)
    cl.enqueue_copy(self.queue, read_res_39314, mappingindices_mem_37893,
                    device_offset=np.long((i_31490 * np.int32(4))),
                    is_blocking=True)
    arg_31631 = read_res_39314[0]
    res_31632 = sitofp_i32_f32(arg_31631)
    group_sizze_35821 = self.sizes["main.group_size_35801"]
    y_35822 = (group_sizze_35821 - np.int32(1))
    x_35823 = (arg_31616 + y_35822)
    num_groups_35824 = squot32(x_35823, group_sizze_35821)
    num_threads_35825 = (group_sizze_35821 * num_groups_35824)
    binop_x_38360 = sext_i32_i64(arg_31616)
    bytes_38359 = (np.int64(4) * binop_x_38360)
    mem_38361 = opencl_alloc(self, bytes_38359, "mem_38361")
    if ((1 * (np.long(num_groups_35824) * np.long(group_sizze_35821))) != 0):
      self.map_35807_var.set_args(np.float32(lam_31222), np.int32(arg_31616),
                                  np.int32(x_31629), np.float32(res_31632),
                                  mappingindices_mem_37893, mem_38361)
      cl.enqueue_nd_range_kernel(self.queue, self.map_35807_var,
                                 ((np.long(num_groups_35824) * np.long(group_sizze_35821)),),
                                 (np.long(group_sizze_35821),))
      if synchronous:
        self.queue.finish()
    empty_or_match_cert_31645 = True
    assert empty_or_match_31626, ("Error at bfastfinaldetailed.fut:112:1-240:86 -> bfastfinaldetailed.fut:209:38-238:9 -> /futlib/functional.fut:7:42-44 -> bfastfinaldetailed.fut:236:33-53 -> /futlib/array.fut:66:1-67:19: %s%s%s%d%s%s" % ("Function return value does not match shape of type ",
                                                                                                                                                                                                                                               "*",
                                                                                                                                                                                                                                               "[",
                                                                                                                                                                                                                                               arg_31616,
                                                                                                                                                                                                                                               "]",
                                                                                                                                                                                                                                               "t"))
    suff_outer_par_35838 = (self.sizes["main.suff_outer_par_38"] <= sizze_31215)
    num_threads_35850 = (sizze_31215 * arg_31616)
    fits_36166 = sle32(arg_31616, max_group_sizze_32576)
    suff_intra_par_36164 = (self.sizes["main.suff_intra_par_39"] <= arg_31616)
    intra_suff_and_fits_36167 = (suff_intra_par_36164 and fits_36166)
    binop_x_38379 = sext_i32_i64(num_threads_35850)
    bytes_38377 = (np.int64(4) * binop_x_38379)
    if suff_outer_par_35838:
      group_sizze_36025 = self.sizes["main.group_size_35886"]
      y_36026 = (group_sizze_36025 - np.int32(1))
      x_36027 = (sizze_31215 + y_36026)
      num_groups_36028 = squot32(x_36027, group_sizze_36025)
      num_threads_36029 = (group_sizze_36025 * num_groups_36028)
      mem_38380 = opencl_alloc(self, bytes_38377, "mem_38380")
      mem_38384 = opencl_alloc(self, bytes_38377, "mem_38384")
      mem_38387 = opencl_alloc(self, bytes_38223, "mem_38387")
      mem_38390 = opencl_alloc(self, bytes_38223, "mem_38390")
      binop_x_38366 = sext_i32_i64(group_sizze_36025)
      bytes_38365 = (np.int64(4) * binop_x_38366)
      num_threads64_38532 = sext_i32_i64(num_threads_36029)
      total_sizze_38533 = (bytes_38359 * num_threads64_38532)
      mem_38364 = opencl_alloc(self, total_sizze_38533, "mem_38364")
      total_sizze_38534 = (bytes_38365 * num_threads64_38532)
      mem_38370 = opencl_alloc(self, total_sizze_38534, "mem_38370")
      total_sizze_38535 = (bytes_38365 * num_threads64_38532)
      mem_38373 = opencl_alloc(self, total_sizze_38535, "mem_38373")
      total_sizze_38536 = (bytes_38359 * num_threads64_38532)
      mem_38376 = opencl_alloc(self, total_sizze_38536, "mem_38376")
      if ((1 * (np.long(num_groups_36028) * np.long(group_sizze_36025))) != 0):
        self.map_35892_var.set_args(np.int32(sizze_31214),
                                    np.int32(sizze_31215), np.int32(n_31219),
                                    np.int32(arg_31616), res_mem_38289,
                                    res_mem_38290, res_mem_38291, res_mem_38339,
                                    res_mem_38340, res_mem_38341, res_mem_38358,
                                    mem_38361, mem_38364, mem_38370, mem_38373,
                                    mem_38376, mem_38380, mem_38384, mem_38387,
                                    mem_38390)
        cl.enqueue_nd_range_kernel(self.queue, self.map_35892_var,
                                   ((np.long(num_groups_36028) * np.long(group_sizze_36025)),),
                                   (np.long(group_sizze_36025),))
        if synchronous:
          self.queue.finish()
      mem_38364 = None
      mem_38370 = None
      mem_38373 = None
      mem_38376 = None
      mem_38394 = opencl_alloc(self, bytes_38377, "mem_38394")
      self.futhark__map_transpose_f32(mem_38394, np.int32(0), mem_38380,
                                      np.int32(0), np.int32(1), sizze_31215,
                                      arg_31616, (sizze_31215 * arg_31616),
                                      (sizze_31215 * arg_31616))
      mem_38380 = None
      mem_38399 = opencl_alloc(self, bytes_38377, "mem_38399")
      self.futhark__map_transpose_f32(mem_38399, np.int32(0), mem_38384,
                                      np.int32(0), np.int32(1), sizze_31215,
                                      arg_31616, (sizze_31215 * arg_31616),
                                      (sizze_31215 * arg_31616))
      mem_38384 = None
      res_mem_38471 = mem_38394
      res_mem_38472 = mem_38399
      res_mem_38473 = mem_38387
      res_mem_38474 = mem_38390
    else:
      if intra_suff_and_fits_36167:
        mem_38421 = opencl_alloc(self, bytes_38377, "mem_38421")
        mem_38425 = opencl_alloc(self, bytes_38377, "mem_38425")
        mem_38428 = opencl_alloc(self, bytes_38223, "mem_38428")
        mem_38431 = opencl_alloc(self, bytes_38223, "mem_38431")
        if ((1 * (np.long(sizze_31215) * np.long(arg_31616))) != 0):
          self.map_intra_group_35851_var.set_args(cl.LocalMemory(np.long(bytes_38359)),
                                                  cl.LocalMemory(np.long(binop_x_38360)),
                                                  cl.LocalMemory(np.long(bytes_38359)),
                                                  cl.LocalMemory(np.long(bytes_38359)),
                                                  cl.LocalMemory(np.long(bytes_38359)),
                                                  cl.LocalMemory(np.long(bytes_38359)),
                                                  np.int32(sizze_31214),
                                                  np.int32(sizze_31215),
                                                  np.int32(n_31219),
                                                  np.int32(arg_31616),
                                                  res_mem_38289, res_mem_38290,
                                                  res_mem_38291, res_mem_38339,
                                                  res_mem_38340, res_mem_38341,
                                                  res_mem_38358, mem_38361,
                                                  mem_38421, mem_38425,
                                                  mem_38428, mem_38431)
          cl.enqueue_nd_range_kernel(self.queue, self.map_intra_group_35851_var,
                                     ((np.long(sizze_31215) * np.long(arg_31616)),),
                                     (np.long(arg_31616),))
          if synchronous:
            self.queue.finish()
        res_mem_38467 = mem_38421
        res_mem_38468 = mem_38425
        res_mem_38469 = mem_38428
        res_mem_38470 = mem_38431
      else:
        group_sizze_36502 = self.sizes["main.group_size_36479"]
        y_36503 = (group_sizze_36502 - np.int32(1))
        x_36504 = (sizze_31215 + y_36503)
        num_groups_36505 = squot32(x_36504, group_sizze_36502)
        num_threads_36506 = (group_sizze_36502 * num_groups_36505)
        mem_38434 = opencl_alloc(self, bytes_38223, "mem_38434")
        mem_38437 = opencl_alloc(self, bytes_38223, "mem_38437")
        if ((1 * (np.long(num_groups_36505) * np.long(group_sizze_36502))) != 0):
          self.map_36485_var.set_args(np.int32(sizze_31215), res_mem_38289,
                                      res_mem_38340, res_mem_38341, mem_38434,
                                      mem_38437)
          cl.enqueue_nd_range_kernel(self.queue, self.map_36485_var,
                                     ((np.long(num_groups_36505) * np.long(group_sizze_36502)),),
                                     (np.long(group_sizze_36502),))
          if synchronous:
            self.queue.finish()
        group_sizze_36528 = self.sizes["main.group_size_36446"]
        max_num_groups_36529 = self.sizes["main.max_num_groups_36448"]
        group_sizze_36530 = sext_i32_i64(group_sizze_36528)
        max_num_groups_36531 = sext_i32_i64(max_num_groups_36529)
        y_36532 = (group_sizze_36530 - np.int64(1))
        x_36533 = (y_36532 + binop_x_38379)
        w_div_group_sizze_36534 = squot64(x_36533, group_sizze_36530)
        num_groups_maybe_zzero_36535 = smin64(max_num_groups_36531,
                                              w_div_group_sizze_36534)
        num_groups_36536 = smax64(np.int64(1), num_groups_maybe_zzero_36535)
        num_threads_36537 = (group_sizze_36530 * num_groups_36536)
        num_groups_36538 = sext_i64_i32(num_groups_36536)
        num_threads_36539 = sext_i64_i32(num_threads_36537)
        mem_38441 = opencl_alloc(self, bytes_38377, "mem_38441")
        if ((1 * (np.long(num_groups_36538) * np.long(group_sizze_36528))) != 0):
          self.scan_stage1_36464_var.set_args(np.int32(sizze_31214),
                                              np.int32(sizze_31215),
                                              np.int32(arg_31616),
                                              np.int32(num_groups_36538),
                                              res_mem_38290, res_mem_38339,
                                              res_mem_38340, res_mem_38358,
                                              mem_38437, mem_38441)
          cl.enqueue_nd_range_kernel(self.queue, self.scan_stage1_36464_var,
                                     ((np.long(num_groups_36538) * np.long(group_sizze_36528)),),
                                     (np.long(group_sizze_36528),))
          if synchronous:
            self.queue.finish()
        if ((1 * (np.long(np.int32(1)) * np.long(num_groups_36538))) != 0):
          self.scan_stage2_39203_var.set_args(cl.LocalMemory(np.long((np.int32(4) * num_groups_36538))),
                                              np.int32(sizze_31215),
                                              np.int32(arg_31616),
                                              np.int32(num_groups_36538),
                                              mem_38441)
          cl.enqueue_nd_range_kernel(self.queue, self.scan_stage2_39203_var,
                                     ((np.long(np.int32(1)) * np.long(num_groups_36538)),),
                                     (np.long(num_groups_36538),))
          if synchronous:
            self.queue.finish()
        group_sizze_39219 = self.sizes["main.group_size_39219"]
        num_groups_39220 = squot32((((sizze_31215 * arg_31616) + sext_i32_i32(group_sizze_39219)) - np.int32(1)),
                                   sext_i32_i32(group_sizze_39219))
        if ((1 * (np.long(num_groups_39220) * np.long(group_sizze_39219))) != 0):
          self.scan_stage3_39216_var.set_args(np.int32(sizze_31215),
                                              np.int32(arg_31616),
                                              np.int32(num_groups_36538),
                                              mem_38441)
          cl.enqueue_nd_range_kernel(self.queue, self.scan_stage3_39216_var,
                                     ((np.long(num_groups_39220) * np.long(group_sizze_39219)),),
                                     (np.long(group_sizze_39219),))
          if synchronous:
            self.queue.finish()
        group_sizze_36582 = self.sizes["main.group_size_36405"]
        max_num_groups_36583 = self.sizes["main.max_num_groups_36407"]
        group_sizze_36584 = sext_i32_i64(group_sizze_36582)
        max_num_groups_36585 = sext_i32_i64(max_num_groups_36583)
        y_36586 = (group_sizze_36584 - np.int64(1))
        x_36587 = (y_36586 + binop_x_38379)
        w_div_group_sizze_36588 = squot64(x_36587, group_sizze_36584)
        num_groups_maybe_zzero_36589 = smin64(max_num_groups_36585,
                                              w_div_group_sizze_36588)
        num_groups_36590 = smax64(np.int64(1), num_groups_maybe_zzero_36589)
        num_threads_36591 = (group_sizze_36584 * num_groups_36590)
        num_groups_36592 = sext_i64_i32(num_groups_36590)
        num_threads_36593 = sext_i64_i32(num_threads_36591)
        mem_38443 = opencl_alloc(self, binop_x_38224, "mem_38443")
        mem_38446 = opencl_alloc(self, bytes_38223, "mem_38446")
        mem_38449 = opencl_alloc(self, bytes_38223, "mem_38449")
        mem_38453 = opencl_alloc(self, bytes_38377, "mem_38453")
        if slt32((arg_31616 * np.int32(2)), group_sizze_36582):
          segment_sizze_nonzzero_39225 = smax32(np.int32(1), arg_31616)
          if ((1 * (np.long(num_groups_36592) * np.long(group_sizze_36582))) != 0):
            self.segred_small_36423_var.set_args(np.int32(sizze_31215),
                                                 np.int32(arg_31616),
                                                 np.int32(num_groups_36592),
                                                 mem_38361, mem_38434,
                                                 mem_38437, mem_38441,
                                                 mem_38443, mem_38446,
                                                 mem_38449, mem_38453,
                                                 np.int32(segment_sizze_nonzzero_39225))
            cl.enqueue_nd_range_kernel(self.queue, self.segred_small_36423_var,
                                       ((np.long(num_groups_36592) * np.long(group_sizze_36582)),),
                                       (np.long(group_sizze_36582),))
            if synchronous:
              self.queue.finish()
        else:
          num_groups_39250 = (squot32(((num_groups_36592 + smax32(np.int32(1),
                                                                  sizze_31215)) - np.int32(1)),
                                      smax32(np.int32(1),
                                             sizze_31215)) * sizze_31215)
          num_threads_39251 = (num_groups_39250 * group_sizze_36582)
          thread_per_segment_39252 = (squot32(((num_groups_36592 + smax32(np.int32(1),
                                                                          sizze_31215)) - np.int32(1)),
                                              smax32(np.int32(1),
                                                     sizze_31215)) * group_sizze_36582)
          group_res_arr_mem_39253 = opencl_alloc(self,
                                                 (np.int32(1) * num_groups_39250),
                                                 "group_res_arr_mem_39253")
          group_res_arr_mem_39255 = opencl_alloc(self,
                                                 (np.int32(4) * num_groups_39250),
                                                 "group_res_arr_mem_39255")
          group_res_arr_mem_39257 = opencl_alloc(self,
                                                 (np.int32(4) * num_groups_39250),
                                                 "group_res_arr_mem_39257")
          counter_mem_39259 = self.counter_mem_39259
          if ((1 * (np.long(num_groups_39250) * np.long(group_sizze_36582))) != 0):
            self.segred_large_36423_var.set_args(np.int32(sizze_31215),
                                                 np.int32(arg_31616),
                                                 np.int32(num_groups_36592),
                                                 mem_38361, mem_38434,
                                                 mem_38437, mem_38441,
                                                 mem_38443, mem_38446,
                                                 mem_38449, mem_38453,
                                                 group_res_arr_mem_39253,
                                                 group_res_arr_mem_39255,
                                                 group_res_arr_mem_39257,
                                                 counter_mem_39259)
            cl.enqueue_nd_range_kernel(self.queue, self.segred_large_36423_var,
                                       ((np.long(num_groups_39250) * np.long(group_sizze_36582)),),
                                       (np.long(group_sizze_36582),))
            if synchronous:
              self.queue.finish()
        mem_38434 = None
        mem_38441 = None
        group_sizze_36632 = self.sizes["main.group_size_36371"]
        y_36633 = (group_sizze_36632 - np.int32(1))
        x_36634 = (sizze_31215 + y_36633)
        num_groups_36635 = squot32(x_36634, group_sizze_36632)
        num_threads_36636 = (group_sizze_36632 * num_groups_36635)
        mem_38456 = opencl_alloc(self, bytes_38223, "mem_38456")
        if ((sizze_31215 * np.int32(4)) != 0):
          cl.enqueue_copy(self.queue, mem_38456, mem_38449,
                          dest_offset=np.long(np.int32(0)),
                          src_offset=np.long(np.int32(0)),
                          byte_count=np.long((sizze_31215 * np.int32(4))))
        if synchronous:
          self.queue.finish()
        mem_38449 = None
        mem_38459 = opencl_alloc(self, bytes_38223, "mem_38459")
        if ((1 * (np.long(num_groups_36635) * np.long(group_sizze_36632))) != 0):
          self.map_36377_var.set_args(np.int32(sizze_31215), mem_38443,
                                      mem_38446, mem_38459)
          cl.enqueue_nd_range_kernel(self.queue, self.map_36377_var,
                                     ((np.long(num_groups_36635) * np.long(group_sizze_36632)),),
                                     (np.long(group_sizze_36632),))
          if synchronous:
            self.queue.finish()
        mem_38446 = None
        group_sizze_36656 = self.sizes["main.group_size_36338"]
        y_36657 = (group_sizze_36656 - np.int32(1))
        x_36658 = (sizze_31215 + y_36657)
        num_groups_36659 = squot32(x_36658, group_sizze_36656)
        num_threads_36660 = (group_sizze_36656 * num_groups_36659)
        mem_38462 = opencl_alloc(self, bytes_38223, "mem_38462")
        if ((1 * (np.long(num_groups_36659) * np.long(group_sizze_36656))) != 0):
          self.map_36344_var.set_args(np.int32(sizze_31214),
                                      np.int32(sizze_31215), np.int32(n_31219),
                                      res_mem_38291, res_mem_38340, mem_38437,
                                      mem_38443, mem_38459, mem_38462)
          cl.enqueue_nd_range_kernel(self.queue, self.map_36344_var,
                                     ((np.long(num_groups_36659) * np.long(group_sizze_36656)),),
                                     (np.long(group_sizze_36656),))
          if synchronous:
            self.queue.finish()
        mem_38443 = None
        mem_38459 = None
        mem_38466 = opencl_alloc(self, bytes_38377, "mem_38466")
        group_sizze_39299 = self.sizes["main.group_size_39299"]
        num_groups_39300 = squot32((((sizze_31215 * arg_31616) + sext_i32_i32(group_sizze_39299)) - np.int32(1)),
                                   sext_i32_i32(group_sizze_39299))
        if ((1 * (np.long(num_groups_39300) * np.long(group_sizze_39299))) != 0):
          self.replicate_39296_var.set_args(np.int32(sizze_31215),
                                            np.int32(arg_31616), mem_38466)
          cl.enqueue_nd_range_kernel(self.queue, self.replicate_39296_var,
                                     ((np.long(num_groups_39300) * np.long(group_sizze_39299)),),
                                     (np.long(group_sizze_39299),))
          if synchronous:
            self.queue.finish()
        group_sizze_36697 = self.sizes["main.group_size_36289"]
        y_36698 = (group_sizze_36697 - np.int32(1))
        x_36699 = (num_threads_35850 + y_36698)
        num_groups_36700 = squot32(x_36699, group_sizze_36697)
        num_threads_36701 = (group_sizze_36697 * num_groups_36700)
        if ((1 * (np.long(num_groups_36700) * np.long(group_sizze_36697))) != 0):
          self.map_36295_var.set_args(np.int32(sizze_31214),
                                      np.int32(sizze_31215), np.int32(n_31219),
                                      np.int32(arg_31616), res_mem_38291,
                                      res_mem_38340, mem_38437, mem_38453,
                                      mem_38466)
          cl.enqueue_nd_range_kernel(self.queue, self.map_36295_var,
                                     ((np.long(num_groups_36700) * np.long(group_sizze_36697)),),
                                     (np.long(group_sizze_36697),))
          if synchronous:
            self.queue.finish()
        mem_38437 = None
        res_mem_38467 = mem_38466
        res_mem_38468 = mem_38453
        res_mem_38469 = mem_38462
        res_mem_38470 = mem_38456
      res_mem_38471 = res_mem_38467
      res_mem_38472 = res_mem_38468
      res_mem_38473 = res_mem_38469
      res_mem_38474 = res_mem_38470
    res_mem_38291 = None
    res_mem_38339 = None
    out_arrsizze_38554 = sizze_31215
    out_arrsizze_38556 = sizze_31215
    out_arrsizze_38558 = sizze_31215
    out_arrsizze_38560 = sizze_31215
    out_arrsizze_38562 = sizze_31215
    out_arrsizze_38563 = arg_31616
    out_arrsizze_38565 = sizze_31215
    out_arrsizze_38566 = arg_31616
    out_arrsizze_38568 = arg_31616
    out_arrsizze_38570 = sizze_31215
    out_arrsizze_38572 = sizze_31215
    out_arrsizze_38574 = sizze_31215
    out_arrsizze_38575 = sizze_31214
    out_arrsizze_38577 = sizze_31215
    out_arrsizze_38578 = sizze_31214
    out_mem_38553 = res_mem_38358
    out_mem_38555 = res_mem_38289
    out_mem_38557 = res_mem_38340
    out_mem_38559 = res_mem_38341
    out_mem_38561 = res_mem_38471
    out_mem_38564 = res_mem_38472
    out_mem_38567 = mem_38361
    out_mem_38569 = res_mem_38473
    out_mem_38571 = res_mem_38474
    out_mem_38573 = res_mem_38290
    out_mem_38576 = res_mem_38199
    return (out_mem_38553, out_arrsizze_38554, out_mem_38555,
            out_arrsizze_38556, out_mem_38557, out_arrsizze_38558,
            out_mem_38559, out_arrsizze_38560, out_mem_38561,
            out_arrsizze_38562, out_arrsizze_38563, out_mem_38564,
            out_arrsizze_38565, out_arrsizze_38566, out_mem_38567,
            out_arrsizze_38568, out_mem_38569, out_arrsizze_38570,
            out_mem_38571, out_arrsizze_38572, out_mem_38573,
            out_arrsizze_38574, out_arrsizze_38575, out_mem_38576,
            out_arrsizze_38577, out_arrsizze_38578)
  def futhark__map_transpose_i32(self, destmem_0, destoffset_1, srcmem_2,
                                 srcoffset_3, num_arrays_4, x_elems_5,
                                 y_elems_6, in_elems_7, out_elems_8):
    """Enqueue a transpose of 4-byte elements from `srcmem_2` to `destmem_0`.

    Depending on the matrix dimensions, the work is carried out by a plain
    buffer copy or by one of four transpose kernels
    (`map_transpose_i32_low_width`, `..._low_height`, `..._small`, or the
    general `map_transpose_i32`).

    Parameters:
      destmem_0, srcmem_2: destination / source OpenCL buffers.
      destoffset_1, srcoffset_3: offsets into the buffers.  In the copy path
        they are handed to `cl.enqueue_copy` as byte offsets; in the kernel
        paths they are passed through as 32-bit kernel arguments.
      num_arrays_4: number of matrices to transpose.
      x_elems_5, y_elems_6: the two dimensions of each matrix.
      in_elems_7, out_elems_8: element counts forwarded to the kernels;
        `in_elems_7 * 4` is the byte count used in the copy path.

    Returns:
      An empty tuple.

    Launch sizes are converted with the builtin `int` rather than `np.long`,
    which no longer exists in NumPy 1.24+.
    """
    # Nothing to move when any dimension is empty.
    if ((num_arrays_4 == np.int32(0)) or (x_elems_5 == np.int32(0)) or
        (y_elems_6 == np.int32(0))):
      return ()

    muly_10 = squot32(np.int32(16), x_elems_5)
    mulx_9 = squot32(np.int32(16), y_elems_6)

    # With a single row or column, the transposed layout equals the input
    # layout, so a straight buffer copy is sufficient.
    is_plain_copy = ((in_elems_7 == out_elems_8) and
                     ((num_arrays_4 == np.int32(1)) or
                      ((x_elems_5 * y_elems_6) == in_elems_7)) and
                     ((x_elems_5 == np.int32(1)) or
                      (y_elems_6 == np.int32(1))))
    if is_plain_copy:
      byte_count = (in_elems_7 * np.int32(4))
      if byte_count != 0:
        cl.enqueue_copy(self.queue, destmem_0, srcmem_2,
                        dest_offset=int(destoffset_1),
                        src_offset=int(srcoffset_3),
                        byte_count=int(byte_count))
      # `synchronous` is a module-level flag defined outside this method.
      if synchronous:
        self.queue.finish()
      return ()

    def groups_for(elems, tile):
      # (elems + tile - 1) quot tile, evaluated with the file's squot32.
      return squot32(((elems + tile) - np.int32(1)), tile)

    i16 = np.int32(16)
    i32 = np.int32(32)
    i256 = np.int32(256)

    # Select the kernel and its launch geometry from the matrix dimensions.
    if (sle32(x_elems_5, np.int32(8)) and slt32(np.int32(16), y_elems_6)):
      kernel = self.map_transpose_i32_low_width_var
      global_size = (int(groups_for(x_elems_5, i16)) * 16,
                     int(groups_for(groups_for(y_elems_6, muly_10),
                                    i16)) * 16,
                     int(num_arrays_4))
      local_size = (16, 16, 1)
    elif (sle32(y_elems_6, np.int32(8)) and slt32(np.int32(16), x_elems_5)):
      kernel = self.map_transpose_i32_low_height_var
      global_size = (int(groups_for(groups_for(x_elems_5, mulx_9),
                                    i16)) * 16,
                     int(groups_for(y_elems_6, i16)) * 16,
                     int(num_arrays_4))
      local_size = (16, 16, 1)
    elif (sle32(x_elems_5, np.int32(8)) and sle32(y_elems_6, np.int32(8))):
      kernel = self.map_transpose_i32_small_var
      total_elems = ((num_arrays_4 * x_elems_5) * y_elems_6)
      global_size = (int(groups_for(total_elems, i256)) * 256,)
      local_size = (256,)
    else:
      kernel = self.map_transpose_i32_var
      # The second dimension is launched with 8 work-items per 32-wide tile.
      global_size = (int(groups_for(x_elems_5, i32)) * 32,
                     int(groups_for(y_elems_6, i32)) * 8,
                     int(num_arrays_4))
      local_size = (32, 8, 1)

    # Skip the launch entirely when the global range is empty.
    if all(dim != 0 for dim in global_size):
      kernel.set_args(np.int32(destoffset_1), np.int32(srcoffset_3),
                      np.int32(num_arrays_4), np.int32(x_elems_5),
                      np.int32(y_elems_6), np.int32(in_elems_7),
                      np.int32(out_elems_8), np.int32(mulx_9),
                      np.int32(muly_10), destmem_0, srcmem_2)
      cl.enqueue_nd_range_kernel(self.queue, kernel, global_size, local_size)
      if synchronous:
        self.queue.finish()
    return ()
  def futhark_remove_nans(self, images_mem_37893, sizze_31200, sizze_31201,
                          sizze_31202, nan_value_31203):
    """Run the `map_31914` kernel over a three-dimensional image buffer.

    A new device buffer of 4 bytes per element
    (`sizze_31200 * sizze_31201 * sizze_31202` elements) is allocated and
    handed to the kernel as its output.
    NOTE(review): judging by the entry-point name, the kernel presumably
    replaces samples equal to `nan_value_31203` with NaN; the kernel source
    is not visible here, so confirm before relying on it.

    Parameters:
      images_mem_37893: input OpenCL buffer.
      sizze_31200, sizze_31201, sizze_31202: the three array dimensions.
      nan_value_31203: sentinel value, passed to the kernel as a 16-bit int.

    Returns:
      (output buffer, sizze_31200, sizze_31201, sizze_31202).

    Launch sizes are converted with the builtin `int` rather than `np.long`,
    which no longer exists in NumPy 1.24+.
    """
    nesting_sizze_31969 = (sizze_31201 * sizze_31202)
    nesting_sizze_31970 = (sizze_31200 * nesting_sizze_31969)
    group_sizze_31971 = self.sizes["remove_nans.group_size_31908"]
    # Enough work-groups to give every element one work-item.
    x_31973 = (nesting_sizze_31970 + (group_sizze_31971 - np.int32(1)))
    num_groups_31974 = squot32(x_31973, group_sizze_31971)

    # Output size in bytes, widened to 64 bits before scaling by 4.
    binop_x_37895 = (sizze_31200 * sizze_31201)
    convop_x_37896 = (sizze_31202 * binop_x_37895)
    bytes_37894 = (np.int64(4) * sext_i32_i64(convop_x_37896))
    mem_37898 = opencl_alloc(self, bytes_37894, "mem_37898")

    global_size = (int(num_groups_31974) * int(group_sizze_31971))
    # An empty global range must not be enqueued.
    if global_size != 0:
      self.map_31914_var.set_args(np.int32(sizze_31200), np.int32(sizze_31201),
                                  np.int32(sizze_31202),
                                  np.int16(nan_value_31203), images_mem_37893,
                                  mem_37898)
      cl.enqueue_nd_range_kernel(self.queue, self.map_31914_var,
                                 (global_size,),
                                 (int(group_sizze_31971),))
      # `synchronous` is a module-level flag defined outside this method.
      if synchronous:
        self.queue.finish()
    return (mem_37898, sizze_31200, sizze_31201, sizze_31202)
  def futhark_reshapeTransp(self, images_mem_37893, sizze_31193, sizze_31194,
                            sizze_31195):
    """Flatten the two trailing dimensions and transpose the result.

    The input is treated as a `sizze_31193` x (`sizze_31194 * sizze_31195`)
    matrix of 4-byte elements; the transposed matrix is written to a newly
    allocated device buffer via `futhark__map_transpose_f32`.

    Returns:
      (output buffer, sizze_31194 * sizze_31195, sizze_31193).
    """
    flat_dim = (sizze_31194 * sizze_31195)
    # Byte size of the output: 4 bytes per element, computed in 64 bits.
    out_bytes = (np.int64(4) * sext_i32_i64((sizze_31193 * flat_dim)))
    out_mem = opencl_alloc(self, out_bytes, "mem_37897")
    elem_count = (flat_dim * sizze_31193)
    zero_offset = np.int32(0)
    self.futhark__map_transpose_f32(out_mem, zero_offset, images_mem_37893,
                                    zero_offset, np.int32(1), flat_dim,
                                    sizze_31193, elem_count, elem_count)
    return (out_mem, flat_dim, sizze_31193)
  def futhark__map_transpose_f32(self, destmem_0, destoffset_1, srcmem_2,
                                 srcoffset_3, num_arrays_4, x_elems_5,
                                 y_elems_6, in_elems_7, out_elems_8):
    """Enqueue a transpose of 4-byte elements from `srcmem_2` to `destmem_0`.

    Depending on the matrix dimensions, the work is carried out by a plain
    buffer copy or by one of four transpose kernels
    (`map_transpose_f32_low_width`, `..._low_height`, `..._small`, or the
    general `map_transpose_f32`).

    Parameters:
      destmem_0, srcmem_2: destination / source OpenCL buffers.
      destoffset_1, srcoffset_3: offsets into the buffers.  In the copy path
        they are handed to `cl.enqueue_copy` as byte offsets; in the kernel
        paths they are passed through as 32-bit kernel arguments.
      num_arrays_4: number of matrices to transpose.
      x_elems_5, y_elems_6: the two dimensions of each matrix.
      in_elems_7, out_elems_8: element counts forwarded to the kernels;
        `in_elems_7 * 4` is the byte count used in the copy path.

    Returns:
      An empty tuple.

    Launch sizes are converted with the builtin `int` rather than `np.long`,
    which no longer exists in NumPy 1.24+.
    """
    # Nothing to move when any dimension is empty.
    if ((num_arrays_4 == np.int32(0)) or (x_elems_5 == np.int32(0)) or
        (y_elems_6 == np.int32(0))):
      return ()

    muly_10 = squot32(np.int32(16), x_elems_5)
    mulx_9 = squot32(np.int32(16), y_elems_6)

    # With a single row or column, the transposed layout equals the input
    # layout, so a straight buffer copy is sufficient.
    is_plain_copy = ((in_elems_7 == out_elems_8) and
                     ((num_arrays_4 == np.int32(1)) or
                      ((x_elems_5 * y_elems_6) == in_elems_7)) and
                     ((x_elems_5 == np.int32(1)) or
                      (y_elems_6 == np.int32(1))))
    if is_plain_copy:
      byte_count = (in_elems_7 * np.int32(4))
      if byte_count != 0:
        cl.enqueue_copy(self.queue, destmem_0, srcmem_2,
                        dest_offset=int(destoffset_1),
                        src_offset=int(srcoffset_3),
                        byte_count=int(byte_count))
      # `synchronous` is a module-level flag defined outside this method.
      if synchronous:
        self.queue.finish()
      return ()

    def groups_for(elems, tile):
      # (elems + tile - 1) quot tile, evaluated with the file's squot32.
      return squot32(((elems + tile) - np.int32(1)), tile)

    i16 = np.int32(16)
    i32 = np.int32(32)
    i256 = np.int32(256)

    # Select the kernel and its launch geometry from the matrix dimensions.
    if (sle32(x_elems_5, np.int32(8)) and slt32(np.int32(16), y_elems_6)):
      kernel = self.map_transpose_f32_low_width_var
      global_size = (int(groups_for(x_elems_5, i16)) * 16,
                     int(groups_for(groups_for(y_elems_6, muly_10),
                                    i16)) * 16,
                     int(num_arrays_4))
      local_size = (16, 16, 1)
    elif (sle32(y_elems_6, np.int32(8)) and slt32(np.int32(16), x_elems_5)):
      kernel = self.map_transpose_f32_low_height_var
      global_size = (int(groups_for(groups_for(x_elems_5, mulx_9),
                                    i16)) * 16,
                     int(groups_for(y_elems_6, i16)) * 16,
                     int(num_arrays_4))
      local_size = (16, 16, 1)
    elif (sle32(x_elems_5, np.int32(8)) and sle32(y_elems_6, np.int32(8))):
      kernel = self.map_transpose_f32_small_var
      total_elems = ((num_arrays_4 * x_elems_5) * y_elems_6)
      global_size = (int(groups_for(total_elems, i256)) * 256,)
      local_size = (256,)
    else:
      kernel = self.map_transpose_f32_var
      # The second dimension is launched with 8 work-items per 32-wide tile.
      global_size = (int(groups_for(x_elems_5, i32)) * 32,
                     int(groups_for(y_elems_6, i32)) * 8,
                     int(num_arrays_4))
      local_size = (32, 8, 1)

    # Skip the launch entirely when the global range is empty.
    if all(dim != 0 for dim in global_size):
      kernel.set_args(np.int32(destoffset_1), np.int32(srcoffset_3),
                      np.int32(num_arrays_4), np.int32(x_elems_5),
                      np.int32(y_elems_6), np.int32(in_elems_7),
                      np.int32(out_elems_8), np.int32(mulx_9),
                      np.int32(muly_10), destmem_0, srcmem_2)
      cl.enqueue_nd_range_kernel(self.queue, kernel, global_size, local_size)
      if synchronous:
        self.queue.finish()
    return ()
  def main(self, trend_31217_ext, k_31218_ext, n_31219_ext, freq_31220_ext,
           hfrac_31221_ext, lam_31222_ext, mappingindices_mem_37893_ext,
           images_mem_37894_ext):
    try:
      trend_31217 = np.int32(ct.c_int32(trend_31217_ext))
    except (TypeError, AssertionError) as e:
      raise TypeError("Argument #0 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format("i32",
                                                                                                                            type(trend_31217_ext),
                                                                                                                            trend_31217_ext))
    try:
      k_31218 = np.int32(ct.c_int32(k_31218_ext))
    except (TypeError, AssertionError) as e:
      raise TypeError("Argument #1 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format("i32",
                                                                                                                            type(k_31218_ext),
                                                                                                                            k_31218_ext))
    try:
      n_31219 = np.int32(ct.c_int32(n_31219_ext))
    except (TypeError, AssertionError) as e:
      raise TypeError("Argument #2 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format("i32",
                                                                                                                            type(n_31219_ext),
                                                                                                                            n_31219_ext))
    try:
      freq_31220 = np.float32(ct.c_float(freq_31220_ext))
    except (TypeError, AssertionError) as e:
      raise TypeError("Argument #3 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format("f32",
                                                                                                                            type(freq_31220_ext),
                                                                                                                            freq_31220_ext))
    try:
      hfrac_31221 = np.float32(ct.c_float(hfrac_31221_ext))
    except (TypeError, AssertionError) as e:
      raise TypeError("Argument #4 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format("f32",
                                                                                                                            type(hfrac_31221_ext),
                                                                                                                            hfrac_31221_ext))
    try:
      lam_31222 = np.float32(ct.c_float(lam_31222_ext))
    except (TypeError, AssertionError) as e:
      raise TypeError("Argument #5 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format("f32",
                                                                                                                            type(lam_31222_ext),
                                                                                                                            lam_31222_ext))
    try:
      assert ((type(mappingindices_mem_37893_ext) in [np.ndarray,
                                                      cl.array.Array]) and (mappingindices_mem_37893_ext.dtype == np.int32)), "Parameter has unexpected type"
      sizze_31214 = np.int32(mappingindices_mem_37893_ext.shape[0])
      if (type(mappingindices_mem_37893_ext) == cl.array.Array):
        mappingindices_mem_37893 = mappingindices_mem_37893_ext.data
      else:
        mappingindices_mem_37893 = opencl_alloc(self,
                                                np.int64(mappingindices_mem_37893_ext.nbytes),
                                                "mappingindices_mem_37893")
        if (np.int64(mappingindices_mem_37893_ext.nbytes) != 0):
          cl.enqueue_copy(self.queue, mappingindices_mem_37893,
                          normaliseArray(mappingindices_mem_37893_ext),
                          is_blocking=synchronous)
    except (TypeError, AssertionError) as e:
      raise TypeError("Argument #6 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format("[]i32",
                                                                                                                            type(mappingindices_mem_37893_ext),
                                                                                                                            mappingindices_mem_37893_ext))
    try:
      assert ((type(images_mem_37894_ext) in [np.ndarray,
                                              cl.array.Array]) and (images_mem_37894_ext.dtype == np.float32)), "Parameter has unexpected type"
      sizze_31215 = np.int32(images_mem_37894_ext.shape[0])
      sizze_31216 = np.int32(images_mem_37894_ext.shape[1])
      if (type(images_mem_37894_ext) == cl.array.Array):
        images_mem_37894 = images_mem_37894_ext.data
      else:
        images_mem_37894 = opencl_alloc(self,
                                        np.int64(images_mem_37894_ext.nbytes),
                                        "images_mem_37894")
        if (np.int64(images_mem_37894_ext.nbytes) != 0):
          cl.enqueue_copy(self.queue, images_mem_37894,
                          normaliseArray(images_mem_37894_ext),
                          is_blocking=synchronous)
    except (TypeError, AssertionError) as e:
      raise TypeError("Argument #7 has invalid value\nFuthark type: {}\nArgument has Python type {} and value: {}\n".format("[][]f32",
                                                                                                                            type(images_mem_37894_ext),
                                                                                                                            images_mem_37894_ext))
    (out_mem_38553, out_arrsizze_38554, out_mem_38555, out_arrsizze_38556,
     out_mem_38557, out_arrsizze_38558, out_mem_38559, out_arrsizze_38560,
     out_mem_38561, out_arrsizze_38562, out_arrsizze_38563, out_mem_38564,
     out_arrsizze_38565, out_arrsizze_38566, out_mem_38567, out_arrsizze_38568,
     out_mem_38569, out_arrsizze_38570, out_mem_38571, out_arrsizze_38572,
     out_mem_38573, out_arrsizze_38574, out_arrsizze_38575, out_mem_38576,
     out_arrsizze_38577,
     out_arrsizze_38578) = self.futhark_main(mappingindices_mem_37893,
                                             images_mem_37894, sizze_31214,
                                             sizze_31215, sizze_31216,
                                             trend_31217, k_31218, n_31219,
                                             freq_31220, hfrac_31221, lam_31222)
    return (cl.array.Array(self.queue, (out_arrsizze_38554,), ct.c_float,
                           data=out_mem_38553), cl.array.Array(self.queue,
                                                               (out_arrsizze_38556,),
                                                               ct.c_int32,
                                                               data=out_mem_38555),
            cl.array.Array(self.queue, (out_arrsizze_38558,), ct.c_int32,
                           data=out_mem_38557), cl.array.Array(self.queue,
                                                               (out_arrsizze_38560,),
                                                               ct.c_float,
                                                               data=out_mem_38559),
            cl.array.Array(self.queue, (out_arrsizze_38562, out_arrsizze_38563),
                           ct.c_float, data=out_mem_38561),
            cl.array.Array(self.queue, (out_arrsizze_38565, out_arrsizze_38566),
                           ct.c_float, data=out_mem_38564),
            cl.array.Array(self.queue, (out_arrsizze_38568,), ct.c_float,
                           data=out_mem_38567), cl.array.Array(self.queue,
                                                               (out_arrsizze_38570,),
                                                               ct.c_int32,
                                                               data=out_mem_38569),
            cl.array.Array(self.queue, (out_arrsizze_38572,), ct.c_float,
                           data=out_mem_38571), cl.array.Array(self.queue,
                             