#line 1 "mkl_fft/src/mklfft.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*
 Copyright (c) 2017-2020, Intel Corporation

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

     * Redistributions of source code must retain the above copyright notice,
       this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of Intel Corporation nor the names of its contributors
       may be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "Python.h"
#include "numpy/arrayobject.h"
#include "mklfft.h"
#include "multi_iter.h"
#include "mkl.h"
#include <limits.h>

#ifdef DEBUG
#include <assert.h>
#include <stdio.h>
#define _debug_print(...)  printf(__VA_ARGS__)
#else
#define _debug_print(...)
#endif

#define OK 0
#define ERROR -1

#define YES 1
#define NO  0

static NPY_INLINE MKL_LONG _to_mkl_long(npy_intp x)
{
    MKL_LONG lr = (MKL_LONG) x;
    assert( (npy_intp) lr == x);
    return lr;
}

NPY_INLINE char * mkl_dfti_error(MKL_LONG status)
{
    return DftiErrorMessage(status);
}

static npy_intp ar_size(npy_intp *sh, int len)
{
    npy_intp r = (len > 0) ? 1 : 0;
    npy_intp *d;
    int i;

    for(d=sh, i=0; i < len; i++, d++)
        r *= *d;

    return r;
}

static NPY_INLINE void get_basic_array_data(
    PyArrayObject* x,
    int *x_rank,
    npy_intp **x_shape,
    npy_intp **x_strides,
    npy_intp *x_itemsize,
    npy_intp *x_size)
{
    npy_intp asize = 0;
    assert(x != NULL);

    *x_rank = PyArray_NDIM(x);
    *x_shape = PyArray_SHAPE(x);
    *x_strides = PyArray_STRIDES(x);
    *x_itemsize = PyArray_ITEMSIZE(x);
    asize = ar_size(*x_shape, *x_rank);
    *x_size = asize;
}

/* ============================================================================= *
                  Routines for working with the cached FFT descriptor
 * ============================================================================= */

int _free_dfti_cache(DftiCache *dfti_cache) {
    if(dfti_cache->initialized && dfti_cache->hand) {
        MKL_LONG status = DftiFreeDescriptor(&(dfti_cache->hand));

        _debug_print("Descriptor freed: %ld\n", status);

        return (int) status;
    }

    return 0;
}

#line 115
static MKL_LONG
__create_descriptor_1d_DFTI_SINGLE_DFTI_COMPLEX(MKL_LONG len, float fsc, float bsc, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    if (dfti_cache->initialized && dfti_cache->hand) {
        enum DFTI_CONFIG_VALUE cached_prec, cached_dom;
        MKL_LONG cached_rank, cached_len;
        float cached_sc;

        status = DftiGetValue(dfti_cache->hand, DFTI_DIMENSION, &cached_rank);
        if (0 != status || cached_rank != 1) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_PRECISION, &cached_prec);
        if (0 != status || cached_prec != DFTI_SINGLE) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_FORWARD_DOMAIN, &cached_dom);
        if (0 != status || cached_dom != DFTI_COMPLEX) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_LENGTHS, &cached_len);
        if (0 != status || cached_len != len) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_FORWARD_SCALE, &cached_sc);
        if (0 != status || cached_sc != fsc) goto set_scales;

        status = DftiGetValue(dfti_cache->hand, DFTI_BACKWARD_SCALE, &cached_sc);
        if (0 != status || cached_sc != bsc) goto set_scales;

        return status;
    }

  allocate_new:
    status =
        DftiCreateDescriptor(
            &(dfti_cache->hand),
            DFTI_SINGLE,
            DFTI_COMPLEX,
            1,
            len
        );
    dfti_cache->initialized = (status == 0) ? 1 : 0;

  set_scales:
    status = DftiSetValue(dfti_cache->hand, DFTI_FORWARD_SCALE, fsc);
    if (0 != status) return status;

    status = DftiSetValue(dfti_cache->hand, DFTI_BACKWARD_SCALE, bsc);

    return status;

  reallocate:
    if(dfti_cache->hand) {
        status = DftiFreeDescriptor(&(dfti_cache->hand));
        assert(status == 0);
    }

    status = 0;
    goto allocate_new;

}

#line 115
static MKL_LONG
__create_descriptor_1d_DFTI_SINGLE_DFTI_REAL(MKL_LONG len, float fsc, float bsc, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    if (dfti_cache->initialized && dfti_cache->hand) {
        enum DFTI_CONFIG_VALUE cached_prec, cached_dom;
        MKL_LONG cached_rank, cached_len;
        float cached_sc;

        status = DftiGetValue(dfti_cache->hand, DFTI_DIMENSION, &cached_rank);
        if (0 != status || cached_rank != 1) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_PRECISION, &cached_prec);
        if (0 != status || cached_prec != DFTI_SINGLE) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_FORWARD_DOMAIN, &cached_dom);
        if (0 != status || cached_dom != DFTI_REAL) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_LENGTHS, &cached_len);
        if (0 != status || cached_len != len) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_FORWARD_SCALE, &cached_sc);
        if (0 != status || cached_sc != fsc) goto set_scales;

        status = DftiGetValue(dfti_cache->hand, DFTI_BACKWARD_SCALE, &cached_sc);
        if (0 != status || cached_sc != bsc) goto set_scales;

        return status;
    }

  allocate_new:
    status =
        DftiCreateDescriptor(
            &(dfti_cache->hand),
            DFTI_SINGLE,
            DFTI_REAL,
            1,
            len
        );
    dfti_cache->initialized = (status == 0) ? 1 : 0;

  set_scales:
    status = DftiSetValue(dfti_cache->hand, DFTI_FORWARD_SCALE, fsc);
    if (0 != status) return status;

    status = DftiSetValue(dfti_cache->hand, DFTI_BACKWARD_SCALE, bsc);

    return status;

  reallocate:
    if(dfti_cache->hand) {
        status = DftiFreeDescriptor(&(dfti_cache->hand));
        assert(status == 0);
    }

    status = 0;
    goto allocate_new;

}

#line 115
static MKL_LONG
__create_descriptor_1d_DFTI_DOUBLE_DFTI_COMPLEX(MKL_LONG len, double fsc, double bsc, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    if (dfti_cache->initialized && dfti_cache->hand) {
        enum DFTI_CONFIG_VALUE cached_prec, cached_dom;
        MKL_LONG cached_rank, cached_len;
        double cached_sc;

        status = DftiGetValue(dfti_cache->hand, DFTI_DIMENSION, &cached_rank);
        if (0 != status || cached_rank != 1) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_PRECISION, &cached_prec);
        if (0 != status || cached_prec != DFTI_DOUBLE) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_FORWARD_DOMAIN, &cached_dom);
        if (0 != status || cached_dom != DFTI_COMPLEX) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_LENGTHS, &cached_len);
        if (0 != status || cached_len != len) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_FORWARD_SCALE, &cached_sc);
        if (0 != status || cached_sc != fsc) goto set_scales;

        status = DftiGetValue(dfti_cache->hand, DFTI_BACKWARD_SCALE, &cached_sc);
        if (0 != status || cached_sc != bsc) goto set_scales;

        return status;
    }

  allocate_new:
    status =
        DftiCreateDescriptor(
            &(dfti_cache->hand),
            DFTI_DOUBLE,
            DFTI_COMPLEX,
            1,
            len
        );
    dfti_cache->initialized = (status == 0) ? 1 : 0;

  set_scales:
    status = DftiSetValue(dfti_cache->hand, DFTI_FORWARD_SCALE, fsc);
    if (0 != status) return status;

    status = DftiSetValue(dfti_cache->hand, DFTI_BACKWARD_SCALE, bsc);

    return status;

  reallocate:
    if(dfti_cache->hand) {
        status = DftiFreeDescriptor(&(dfti_cache->hand));
        assert(status == 0);
    }

    status = 0;
    goto allocate_new;

}

#line 115
static MKL_LONG
__create_descriptor_1d_DFTI_DOUBLE_DFTI_REAL(MKL_LONG len, double fsc, double bsc, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    if (dfti_cache->initialized && dfti_cache->hand) {
        enum DFTI_CONFIG_VALUE cached_prec, cached_dom;
        MKL_LONG cached_rank, cached_len;
        double cached_sc;

        status = DftiGetValue(dfti_cache->hand, DFTI_DIMENSION, &cached_rank);
        if (0 != status || cached_rank != 1) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_PRECISION, &cached_prec);
        if (0 != status || cached_prec != DFTI_DOUBLE) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_FORWARD_DOMAIN, &cached_dom);
        if (0 != status || cached_dom != DFTI_REAL) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_LENGTHS, &cached_len);
        if (0 != status || cached_len != len) goto reallocate;

        status = DftiGetValue(dfti_cache->hand, DFTI_FORWARD_SCALE, &cached_sc);
        if (0 != status || cached_sc != fsc) goto set_scales;

        status = DftiGetValue(dfti_cache->hand, DFTI_BACKWARD_SCALE, &cached_sc);
        if (0 != status || cached_sc != bsc) goto set_scales;

        return status;
    }

  allocate_new:
    status =
        DftiCreateDescriptor(
            &(dfti_cache->hand),
            DFTI_DOUBLE,
            DFTI_REAL,
            1,
            len
        );
    dfti_cache->initialized = (status == 0) ? 1 : 0;

  set_scales:
    status = DftiSetValue(dfti_cache->hand, DFTI_FORWARD_SCALE, fsc);
    if (0 != status) return status;

    status = DftiSetValue(dfti_cache->hand, DFTI_BACKWARD_SCALE, bsc);

    return status;

  reallocate:
    if(dfti_cache->hand) {
        status = DftiFreeDescriptor(&(dfti_cache->hand));
        assert(status == 0);
    }

    status = 0;
    goto allocate_new;

}


static NPY_INLINE int
__longp_equal_1d(MKL_LONG* a, MKL_LONG* b)
{
    return a != NULL && b != NULL &&
           0 == memcmp((const void *)a, (const void *)b, 2*sizeof(MKL_LONG));
}

static MKL_LONG
__set_descriptor_1d_value_longp(enum DFTI_CONFIG_PARAM par, MKL_LONG *val, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;
    MKL_LONG cached_val[2] = {0,0};

    assert(dfti_cache->initialized && dfti_cache->hand);

    status = DftiGetValue(dfti_cache->hand, par, cached_val);
    if (0 == status && __longp_equal_1d(cached_val, val))
        return status;

    status = DftiSetValue(dfti_cache->hand, par, val);

    return status;
}

#line 205
static MKL_LONG
__set_descriptor_1d_value_long(enum DFTI_CONFIG_PARAM par, MKL_LONG val, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;
    MKL_LONG cached_val;

    assert(dfti_cache->initialized && dfti_cache->hand);

    status = DftiGetValue(dfti_cache->hand, par, &cached_val);
    if (0 == status && cached_val == val)
        return status;

    status = DftiSetValue(dfti_cache->hand, par, val);

    return status;
}

#line 205
static MKL_LONG
__set_descriptor_1d_value_enum(enum DFTI_CONFIG_PARAM par, enum DFTI_CONFIG_VALUE val, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;
    enum DFTI_CONFIG_VALUE cached_val;

    assert(dfti_cache->initialized && dfti_cache->hand);

    status = DftiGetValue(dfti_cache->hand, par, &cached_val);
    if (0 == status && cached_val == val)
        return status;

    status = DftiSetValue(dfti_cache->hand, par, val);

    return status;
}


static MKL_LONG
__commit_descriptor_1d(DftiCache *dfti_cache)
{
    MKL_LONG status = 0;
    enum DFTI_CONFIG_VALUE cached_committed;

    assert(dfti_cache->initialized && dfti_cache->hand);
    status = DftiGetValue(dfti_cache->hand, DFTI_COMMIT_STATUS, &cached_committed);

    if(0 == status && cached_committed == DFTI_COMMITTED)
        return status;

    status = DftiCommitDescriptor(dfti_cache->hand);

    return status;
}

#line 244
static NPY_INLINE MKL_LONG
__cached_inplace_DftiComputeForward_MKL_Complex16(MKL_Complex16 *x, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeForward(dfti_cache->hand, x);
    Py_END_ALLOW_THREADS

    return status;
}

#line 244
static NPY_INLINE MKL_LONG
__cached_inplace_DftiComputeForward_MKL_Complex8(MKL_Complex8 *x, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeForward(dfti_cache->hand, x);
    Py_END_ALLOW_THREADS

    return status;
}

#line 244
static NPY_INLINE MKL_LONG
__cached_inplace_DftiComputeForward_double(double *x, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeForward(dfti_cache->hand, x);
    Py_END_ALLOW_THREADS

    return status;
}

#line 244
static NPY_INLINE MKL_LONG
__cached_inplace_DftiComputeForward_float(float *x, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeForward(dfti_cache->hand, x);
    Py_END_ALLOW_THREADS

    return status;
}

#line 244
static NPY_INLINE MKL_LONG
__cached_inplace_DftiComputeBackward_MKL_Complex16(MKL_Complex16 *x, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeBackward(dfti_cache->hand, x);
    Py_END_ALLOW_THREADS

    return status;
}

#line 244
static NPY_INLINE MKL_LONG
__cached_inplace_DftiComputeBackward_MKL_Complex8(MKL_Complex8 *x, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeBackward(dfti_cache->hand, x);
    Py_END_ALLOW_THREADS

    return status;
}

#line 244
static NPY_INLINE MKL_LONG
__cached_inplace_DftiComputeBackward_double(double *x, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeBackward(dfti_cache->hand, x);
    Py_END_ALLOW_THREADS

    return status;
}

#line 244
static NPY_INLINE MKL_LONG
__cached_inplace_DftiComputeBackward_float(float *x, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeBackward(dfti_cache->hand, x);
    Py_END_ALLOW_THREADS

    return status;
}


#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeBackward_float_float(
    float *x_in, float *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeBackward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeBackward_float_MKL_Complex8(
    float *x_in, MKL_Complex8 *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeBackward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeForward_float_float(
    float *x_in, float *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeForward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeForward_float_MKL_Complex8(
    float *x_in, MKL_Complex8 *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeForward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeBackward_MKL_Complex8_float(
    MKL_Complex8 *x_in, float *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeBackward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeBackward_MKL_Complex8_MKL_Complex8(
    MKL_Complex8 *x_in, MKL_Complex8 *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeBackward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeForward_MKL_Complex8_float(
    MKL_Complex8 *x_in, float *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeForward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeForward_MKL_Complex8_MKL_Complex8(
    MKL_Complex8 *x_in, MKL_Complex8 *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeForward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeBackward_double_double(
    double *x_in, double *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeBackward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeBackward_double_MKL_Complex16(
    double *x_in, MKL_Complex16 *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeBackward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeForward_double_double(
    double *x_in, double *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeForward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeForward_double_MKL_Complex16(
    double *x_in, MKL_Complex16 *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeForward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeBackward_MKL_Complex16_double(
    MKL_Complex16 *x_in, double *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeBackward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeBackward_MKL_Complex16_MKL_Complex16(
    MKL_Complex16 *x_in, MKL_Complex16 *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeBackward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeForward_MKL_Complex16_double(
    MKL_Complex16 *x_in, double *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeForward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}

#line 263
static NPY_INLINE MKL_LONG
__cached_notinplace_DftiComputeForward_MKL_Complex16_MKL_Complex16(
    MKL_Complex16 *x_in, MKL_Complex16 *x_out, DftiCache *dfti_cache)
{
    MKL_LONG status = 0;

    assert(dfti_cache->initialized && dfti_cache->hand);
    Py_BEGIN_ALLOW_THREADS
    status = DftiComputeForward(dfti_cache->hand, x_in, x_out);
    Py_END_ALLOW_THREADS

    return status;
}


/*
 * For an array, iterating though indexes [i1, i2], fixing others to zero,
 * and _assuming_ these pointers form a regular lattice, compute distance
 * between adjacent pointers on that lattice.
 *
 */
static NPY_INLINE MKL_LONG
compute_distance(npy_intp *x_strides, npy_intp *x_shape, npy_intp x_itemsize, int x_rank, int i1, int i2, int c_contig, int f_contig) {

    /* i1 equals (axis == 0), hence if (i1) {...} is equivalent to if (axis == 0) {...} */
    if (c_contig) {
	/* Assuming
              x_strides == (x_itemsize * prod(x_shape[k], 0 <k <= x_rank-1), ..., x_itemsize * x_shape[x_rank-1], x_itemsize)
        */
        return (i1) ? 1 : _to_mkl_long(x_shape[x_rank-1]);
    }

    if (f_contig) {
	/* Assuming
              x_strides == (x_itemsize, x_itemsize * x_shape[0], ..., x_itemsize * prod(x_shape[k], 0<=k < x_rank-1))
        */
        return (i1) ? _to_mkl_long(x_shape[0]) : 1;
    }

    /* Right now, this is dead branch of code, since compute_distance is only called if array is ONESEGMENT, 
     * i.e. either C or F contiguous.
     *     It is however preserved here, since in the future this may be called on a not any-contiguous array.
     */
    {

    npy_intp st1, st2;
    npy_intp sh1 = x_shape[i1], sh2 = x_shape[i2];
    npy_intp min_s;

    if (sh1 > 1 && sh2 > 1) {
        st1 = x_strides[i1];
        st2 = x_strides[i2];
        min_s = (st1 > st2) ? st2 : st1;

        return _to_mkl_long(min_s / x_itemsize);
    } else {
        int i;
        npy_intp max_s;
        max_s = x_itemsize;
        for(i=0; i < x_rank; i++) {
            if (x_shape[i] > 1) {
                if (max_s < x_strides[i]) max_s = x_strides[i];
            }
        }
        min_s = max_s;
        for(i=i1; i <= i2; i++) {
            if (x_shape[i] > 1) {
                if (min_s > x_strides[i]) min_s = x_strides[i];
            }
        }
    }

    return _to_mkl_long(min_s / x_itemsize);
    }
}

static NPY_INLINE int
compute_strides_and_distances(
    PyArrayObject *x,
    int x_rank, npy_intp *x_shape, npy_intp *x_strides,
    npy_intp x_itemsize, npy_intp x_size, int axis,
    MKL_LONG *num_fft_transfs,
    MKL_LONG *vec_dist)
{
    int single_DftiCompute = NO;

    /* compute strides and distances*/
    switch(x_rank) {
    case 1:
        single_DftiCompute = YES;
        *num_fft_transfs = 1;
        break;
    case 2:
        single_DftiCompute = YES;
        *num_fft_transfs = _to_mkl_long (x_size / x_shape[axis]);
        *vec_dist = _to_mkl_long ( x_strides[1-axis] / x_itemsize );
        break;
    default:
        /* we must have C- or F- contiguous layout */
        /* TODO: enhance the test to acommodate equal size slices along all axis of a contiguous array,  *
         *       for example x = contig_r3_arr[::4, ::4, ::4]                                            */
        {
            int any_contig = (PyArray_ISONESEGMENT(x)) ? 1 : 0;
            single_DftiCompute =
                (any_contig && (axis == 0 || axis == x_rank-1)) ? YES : NO;
        }
        if (single_DftiCompute) {
            int i0 = (axis == 0);
            int c_contig = PyArray_IS_C_CONTIGUOUS(x);
            int f_contig = PyArray_IS_F_CONTIGUOUS(x);

            *num_fft_transfs = _to_mkl_long (x_size / x_shape[axis]);
            *vec_dist = compute_distance(x_strides, x_shape, x_itemsize, x_rank, i0, i0 + x_rank - 2, c_contig, f_contig);
        } else {
            /* input vectors need not be equidistant, checking if they are  *
             * may be expensive. Fall back on general iterations.           */

            /* TODO: input vectors may be equidistant for a subarray that   *
             * contains axis. For example any rank 2 subarray is, but maybe *
             * it is true for higher ranks.                                 */
            *num_fft_transfs = 1;
        }
    }

    return single_DftiCompute;
}

/* x: input array, y: output array */
static int
compute_strides_and_distances_inout(
    PyArrayObject *x, PyArrayObject *y,
    int x_rank, npy_intp *x_shape, npy_intp *x_strides,
    npy_intp x_itemsize, npy_intp x_size, int axis,
    MKL_LONG *num_fft_transfs,
    MKL_LONG *vec_dist_in,
    npy_intp *y_shape, npy_intp *y_strides, npy_intp y_itemsize,
    MKL_LONG *vec_dist_out
)
{
    int single_DftiCompute = NO;

    /* compute strides and distances*/
    switch(x_rank) {
    case 1:
        single_DftiCompute = YES;
        *num_fft_transfs = 1;
        break;
    case 2:
        single_DftiCompute = YES;
        *num_fft_transfs = _to_mkl_long (x_size / x_shape[axis]);
        *vec_dist_in = _to_mkl_long (x_strides[1-axis] / x_itemsize);
        *vec_dist_out = _to_mkl_long (y_strides[1-axis] / y_itemsize);
        break;
    default:
        assert(x_rank > 2);
        /* we must have C- or F- contiguous layout */
        {
            int any_contig = (PyArray_ISONESEGMENT(x)) ? 1 : 0;
            single_DftiCompute =
                (any_contig && (axis == 0 || axis == x_rank-1)) ? YES : NO;
        }
        if (single_DftiCompute) {
            int i0 = (axis == 0);
            int c_contig = PyArray_IS_C_CONTIGUOUS(x);
            int f_contig = PyArray_IS_F_CONTIGUOUS(x);

            *num_fft_transfs = _to_mkl_long (x_size / x_shape[axis]);
            *vec_dist_in = compute_distance(x_strides, x_shape, x_itemsize, x_rank, i0, i0 + x_rank - 2, c_contig, f_contig);

            c_contig = PyArray_IS_C_CONTIGUOUS(y);
            f_contig = PyArray_IS_F_CONTIGUOUS(y);
            *vec_dist_out = compute_distance(y_strides, y_shape, y_itemsize, x_rank, i0, i0 + x_rank - 2, c_contig, f_contig); 
        } else {
            /* input vectors need not be equidistant, checking if they are  *
             * may be expensive. Fall back on general iterations.           */

            /* TODO: input vectors may be equidistant for a subarray that   *
             * contains axis. For example any rank 2 subarray is, but may   *
             * it is true for higher ranks.                                 */
            *num_fft_transfs = 1;
        }
    }

    return single_DftiCompute;
}

/* ========================================================================= *
                    Routines for performing 1D fft
 * ========================================================================= */

#line 460
int cdouble_mkl_fft1d_in(PyArrayObject* x_inout, npy_intp n, int axis, double fsc, DftiCache* dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0,
             input_number_of_transforms = 1;
    MKL_Complex16 *x_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    npy_intp *x_shape = 0, *x_strides = 0;
    int i, x_rank;
    int single_DftiCompute = NO;
    npy_intp x_size, x_itemsize;

    get_basic_array_data(x_inout, &x_rank, &x_shape,
                         &x_strides, &x_itemsize, &x_size);

    if (x_size == 0) /* nothing to do */
	return status;

    assert( x_size > 0 ); /* assert that x is non-empty */
    assert( 0 <= axis && axis < x_rank );
    assert( x_itemsize == sizeof(MKL_Complex16) );
    assert( 0 < n && n <= x_shape[axis] );

    x_data = (MKL_Complex16*) PyArray_DATA(x_inout);
    {
        char *tmp = (char *) x_data;
        input_strides[1] = ((MKL_Complex16*) (tmp + x_strides[axis])) - x_data;

        for(i=0; i < x_rank; i++) {
            assert( (x_strides[i] % x_itemsize) == 0 );
        }
        assert(input_strides[0] >= 0);
    }

    single_DftiCompute =
    compute_strides_and_distances(
        x_inout,
        x_rank, x_shape, x_strides, x_itemsize, x_size, axis,
        &input_number_of_transforms, &input_distance);

    status = __create_descriptor_1d_DFTI_DOUBLE_DFTI_COMPLEX(
        _to_mkl_long(n), fsc, 1.0/(fsc*n), dfti_cache);
    if (status != 0) goto failed;

    /* these must be always set, since previous cached element
       may have had different values */
    status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;


    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(DFTI_NUMBER_OF_TRANSFORMS,
                                                input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else { /* it is important to set the number of transforms
              for cached descriptor */
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_inplace_DftiComputeForward_MKL_Complex16(x_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((x_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < x_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, x_shape, x_rank, mask, x_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp = (char *) x_data, i = 0; i < x_rank; i++)
                tmp += x_strides[i] * MultiIter_IndexElem(mit, i);

            status = __cached_inplace_DftiComputeForward_MKL_Complex16(
                (MKL_Complex16*) tmp, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


	multi_iter_masked_free(&mit);
	mkl_free(mask);

        if (status != 0) goto failed;
    }


    return status;

  failed:

    return status;
}

#line 460
int cfloat_mkl_fft1d_in(PyArrayObject* x_inout, npy_intp n, int axis, double fsc, DftiCache* dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0,
             input_number_of_transforms = 1;
    MKL_Complex8 *x_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    npy_intp *x_shape = 0, *x_strides = 0;
    int i, x_rank;
    int single_DftiCompute = NO;
    npy_intp x_size, x_itemsize;

    get_basic_array_data(x_inout, &x_rank, &x_shape,
                         &x_strides, &x_itemsize, &x_size);

    if (x_size == 0) /* nothing to do */
	return status;

    assert( x_size > 0 ); /* assert that x is non-empty */
    assert( 0 <= axis && axis < x_rank );
    assert( x_itemsize == sizeof(MKL_Complex8) );
    assert( 0 < n && n <= x_shape[axis] );

    x_data = (MKL_Complex8*) PyArray_DATA(x_inout);
    {
        char *tmp = (char *) x_data;
        input_strides[1] = ((MKL_Complex8*) (tmp + x_strides[axis])) - x_data;

        for(i=0; i < x_rank; i++) {
            assert( (x_strides[i] % x_itemsize) == 0 );
        }
        assert(input_strides[0] >= 0);
    }

    single_DftiCompute =
    compute_strides_and_distances(
        x_inout,
        x_rank, x_shape, x_strides, x_itemsize, x_size, axis,
        &input_number_of_transforms, &input_distance);

    status = __create_descriptor_1d_DFTI_SINGLE_DFTI_COMPLEX(
        _to_mkl_long(n), fsc, 1.0/(fsc*n), dfti_cache);
    if (status != 0) goto failed;

    /* these must be always set, since previous cached element
       may have had different values */
    status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;


    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(DFTI_NUMBER_OF_TRANSFORMS,
                                                input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else { /* it is important to set the number of transforms
              for cached descriptor */
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_inplace_DftiComputeForward_MKL_Complex8(x_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((x_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < x_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, x_shape, x_rank, mask, x_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp = (char *) x_data, i = 0; i < x_rank; i++)
                tmp += x_strides[i] * MultiIter_IndexElem(mit, i);

            status = __cached_inplace_DftiComputeForward_MKL_Complex8(
                (MKL_Complex8*) tmp, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


	multi_iter_masked_free(&mit);
	mkl_free(mask);

        if (status != 0) goto failed;
    }


    return status;

  failed:

    return status;
}

#line 460
int cdouble_mkl_ifft1d_in(PyArrayObject* x_inout, npy_intp n, int axis, double fsc, DftiCache* dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0,
             input_number_of_transforms = 1;
    MKL_Complex16 *x_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    npy_intp *x_shape = 0, *x_strides = 0;
    int i, x_rank;
    int single_DftiCompute = NO;
    npy_intp x_size, x_itemsize;

    get_basic_array_data(x_inout, &x_rank, &x_shape,
                         &x_strides, &x_itemsize, &x_size);

    if (x_size == 0) /* nothing to do */
	return status;

    assert( x_size > 0 ); /* assert that x is non-empty */
    assert( 0 <= axis && axis < x_rank );
    assert( x_itemsize == sizeof(MKL_Complex16) );
    assert( 0 < n && n <= x_shape[axis] );

    x_data = (MKL_Complex16*) PyArray_DATA(x_inout);
    {
        char *tmp = (char *) x_data;
        input_strides[1] = ((MKL_Complex16*) (tmp + x_strides[axis])) - x_data;

        for(i=0; i < x_rank; i++) {
            assert( (x_strides[i] % x_itemsize) == 0 );
        }
        assert(input_strides[0] >= 0);
    }

    single_DftiCompute =
    compute_strides_and_distances(
        x_inout,
        x_rank, x_shape, x_strides, x_itemsize, x_size, axis,
        &input_number_of_transforms, &input_distance);

    status = __create_descriptor_1d_DFTI_DOUBLE_DFTI_COMPLEX(
        _to_mkl_long(n), fsc, 1.0/(fsc*n), dfti_cache);
    if (status != 0) goto failed;

    /* these must be always set, since previous cached element
       may have had different values */
    status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;


    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(DFTI_NUMBER_OF_TRANSFORMS,
                                                input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else { /* it is important to set the number of transforms
              for cached descriptor */
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_inplace_DftiComputeBackward_MKL_Complex16(x_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((x_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < x_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, x_shape, x_rank, mask, x_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp = (char *) x_data, i = 0; i < x_rank; i++)
                tmp += x_strides[i] * MultiIter_IndexElem(mit, i);

            status = __cached_inplace_DftiComputeBackward_MKL_Complex16(
                (MKL_Complex16*) tmp, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


	multi_iter_masked_free(&mit);
	mkl_free(mask);

        if (status != 0) goto failed;
    }


    return status;

  failed:

    return status;
}

#line 460
int cfloat_mkl_ifft1d_in(PyArrayObject* x_inout, npy_intp n, int axis, double fsc, DftiCache* dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0,
             input_number_of_transforms = 1;
    MKL_Complex8 *x_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    npy_intp *x_shape = 0, *x_strides = 0;
    int i, x_rank;
    int single_DftiCompute = NO;
    npy_intp x_size, x_itemsize;

    get_basic_array_data(x_inout, &x_rank, &x_shape,
                         &x_strides, &x_itemsize, &x_size);

    if (x_size == 0) /* nothing to do */
	return status;

    assert( x_size > 0 ); /* assert that x is non-empty */
    assert( 0 <= axis && axis < x_rank );
    assert( x_itemsize == sizeof(MKL_Complex8) );
    assert( 0 < n && n <= x_shape[axis] );

    x_data = (MKL_Complex8*) PyArray_DATA(x_inout);
    {
        char *tmp = (char *) x_data;
        input_strides[1] = ((MKL_Complex8*) (tmp + x_strides[axis])) - x_data;

        for(i=0; i < x_rank; i++) {
            assert( (x_strides[i] % x_itemsize) == 0 );
        }
        assert(input_strides[0] >= 0);
    }

    single_DftiCompute =
    compute_strides_and_distances(
        x_inout,
        x_rank, x_shape, x_strides, x_itemsize, x_size, axis,
        &input_number_of_transforms, &input_distance);

    status = __create_descriptor_1d_DFTI_SINGLE_DFTI_COMPLEX(
        _to_mkl_long(n), fsc, 1.0/(fsc*n), dfti_cache);
    if (status != 0) goto failed;

    /* these must be always set, since previous cached element
       may have had different values */
    status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;


    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(DFTI_NUMBER_OF_TRANSFORMS,
                                                input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else { /* it is important to set the number of transforms
              for cached descriptor */
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_inplace_DftiComputeBackward_MKL_Complex8(x_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((x_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < x_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, x_shape, x_rank, mask, x_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp = (char *) x_data, i = 0; i < x_rank; i++)
                tmp += x_strides[i] * MultiIter_IndexElem(mit, i);

            status = __cached_inplace_DftiComputeBackward_MKL_Complex8(
                (MKL_Complex8*) tmp, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


	multi_iter_masked_free(&mit);
	mkl_free(mask);

        if (status != 0) goto failed;
    }


    return status;

  failed:

    return status;
}


#define TRUE 1
#define FALSE 0

/* is there a nicer way to complex conjugate, perhaps in MKL headers ? */
#define SET_CONJ(mkl_complex_p_dest, mkl_complex_p_src) \
    { ((mkl_complex_p_dest)->real) = ((mkl_complex_p_src)->real); \
       ((mkl_complex_p_dest)->imag) = -((mkl_complex_p_src)->imag); }

#line 603
int float_cfloat_mkl_fft1d_out(
    PyArrayObject *x_in, npy_intp n, int axis, PyArrayObject *x_out,
    int all_harmonics, double fsc, DftiCache* dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0,
             output_distance = 0, input_number_of_transforms = 1;
    float *xin_data = 0;
    MKL_Complex8 *xout_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    MKL_LONG output_strides[2] = {0, 1};
    npy_intp *xin_shape = 0,
             *xin_strides = 0,
             *xout_shape = 0,
             *xout_strides = 0;
    int i, xin_rank, xout_rank;
    int single_DftiCompute = NO;
    npy_intp xin_size, xin_itemsize, xout_itemsize, xout_size;
    float forward_scale = 1.0, backward_scale = 1.0;

    assert(x_in != NULL);
    assert(x_out != NULL);

    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape, &xout_strides,
                         &xout_itemsize, &xout_size);

    assert(xout_rank == xin_rank);

    if (xin_size == 0) {
	/* nothing to do */
	assert(xin_size == xout_size);
	return status;
    }
    assert( xin_size > 0 ); /* assert that array is non-empty */
    assert(0 <= axis && axis < xin_rank);
    assert( 0 < n && n <= xin_shape[axis] );

    assert(xout_itemsize == sizeof(MKL_Complex8));
    assert(xin_itemsize == sizeof(float));
    assert(xout_itemsize = 2 * xin_itemsize);

    for(i=0; i < axis; i++)
        assert(xin_shape[i] == xout_shape[i]);
    assert( xout_shape[axis] == (all_harmonics) ? n : n/2 + 1);
    for(i=axis+1; i < xin_rank; i++)
        assert(xin_shape[i] == xout_shape[i]);

    /* output buffer is created in Cython as C_CONTIGUOUS */
    assert(PyArray_ISONESEGMENT(x_out));

    xin_data = (float*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex8*) PyArray_DATA(x_out);
    {
        char *tmp = (char *) xin_data;
        input_strides[1] =
            ((float*) (tmp + xin_strides[axis])) - xin_data;

        assert(input_strides[0] >= 0);

        for(i=0; i < xin_rank; i++) {
            assert( (xin_strides[i] % xin_itemsize) == 0 );
            assert( (xout_strides[i] % xout_itemsize) == 0 );
            assert( xout_strides[i] > 0 );
        }

        /* Compute output strides */
        tmp = (char *) xout_data;
        output_strides[1] =
            ((MKL_Complex8*) (tmp + xout_strides[axis])) - xout_data;
    }

    single_DftiCompute =
        compute_strides_and_distances_inout(
            x_in, x_out,
            xin_rank, xin_shape, xin_strides,
            xin_itemsize, xin_size, axis,
            &input_number_of_transforms,
            &input_distance,
            xout_shape, xout_strides, xout_itemsize,
            &output_distance
        );

    if (FALSE) { /* we are doing IFFT using Forward computation, swap scales */
        forward_scale = 1.0/(fsc*n);
        backward_scale = fsc;
    } else {
        forward_scale = fsc;
        backward_scale = 1.0/(fsc*n);
    }
    status = __create_descriptor_1d_DFTI_SINGLE_DFTI_REAL(
        _to_mkl_long(n), forward_scale, backward_scale, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(DFTI_OUTPUT_STRIDES, output_strides, dfti_cache);
    if (status != 0) goto failed;

    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_notinplace_DftiComputeForward_float_MKL_Complex8(
            xin_data, xout_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((xin_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < xin_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, xin_shape, xin_rank, mask, xin_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xin_data,
                tmp2 = (char *) xout_data,
                i = 0; i < xin_rank; i++) {
                tmp1 += xin_strides[i] * MultiIter_IndexElem(mit, i);
                tmp2 += xout_strides[i] * MultiIter_IndexElem(mit, i);
            }

            status = __cached_notinplace_DftiComputeForward_float_MKL_Complex8( 
                (float*) tmp1, (MKL_Complex8*) tmp2, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


	multi_iter_masked_free(&mit);
        mkl_free(mask);

        if (status != 0) goto failed;
    }


    /* fill the rest of the output array */
    /* TODO: optimize for 1D and 2D */
    if (all_harmonics) {
        npy_intp n_last = xout_shape[axis];
        npy_intp nh_last = (n_last/2) + 1;

        if (xout_rank == 1) {
            npy_intp it, it_max = (n_last > 2) ? n_last: nh_last;
            npy_intp s = xout_strides[0] / xout_itemsize;

            /* TODO: parallelize with openmp ? */
            for(it = nh_last; it < it_max; it++) {
                MKL_Complex8
                    *src = xout_data + (n_last - it)*s,
                    *dest = xout_data + it*s;

                SET_CONJ(dest, src);
            }
/*
        } else if(single_DftiCompute) {
*/
        } else {
            multi_iter_t mit;
            npy_intp *half_shape;

            half_shape = (npy_intp *) mkl_malloc(xout_rank * sizeof(npy_intp), 64);

            memcpy(half_shape, xout_shape, xout_rank * sizeof(npy_intp));
            half_shape[axis] = (n_last > 2) ? n_last - nh_last: 0;

            multi_iter_new(&mit, half_shape, xout_rank);
	    mkl_free(half_shape);

            while(!MultiIter_Done(mit)) {
                char *tmp1, *tmp2;
                int i;
                MKL_Complex8 *dest, *src;
                /* npy_intp k_last = MultiIter_IndexElem(mit, axis); */

#if defined(__ICC) || defined(__INTEL_COMPILER)
                #pragma novector
#endif
                for(tmp1 = (char *) xout_data, tmp2 = (char *) xout_data,
                    i = 0; i < xout_rank; i++) {
                    npy_intp si = xout_strides[i],
                             ni = xout_shape[i],
                             src_ki = MultiIter_IndexElem(mit, i),
                             dest_ki;

                    dest_ki = src_ki;
                    if (i == axis) {
                        dest_ki += nh_last;
                        assert(dest_ki > 0);
                        assert(dest_ki < ni);
                        src_ki = ni - dest_ki;
                    }

                    tmp1 += si * dest_ki;
                    tmp2 += si * src_ki;
                }
                dest = (MKL_Complex8*) tmp1;
                src  = (MKL_Complex8*) tmp2;

                SET_CONJ(dest, src);

                if (multi_iter_next(&mit))
                    break;
            }


            multi_iter_free(&mit);
        }
    }

    if (FALSE) {
       vmcConj(xout_size, xout_data, xout_data, VML_HA);
    }

    status = OK;

  cleanup:

    return status;

  failed:

    goto cleanup;
}

#line 603
int double_cdouble_mkl_fft1d_out(
    PyArrayObject *x_in, npy_intp n, int axis, PyArrayObject *x_out,
    int all_harmonics, double fsc, DftiCache* dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0,
             output_distance = 0, input_number_of_transforms = 1;
    double *xin_data = 0;
    MKL_Complex16 *xout_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    MKL_LONG output_strides[2] = {0, 1};
    npy_intp *xin_shape = 0,
             *xin_strides = 0,
             *xout_shape = 0,
             *xout_strides = 0;
    int i, xin_rank, xout_rank;
    int single_DftiCompute = NO;
    npy_intp xin_size, xin_itemsize, xout_itemsize, xout_size;
    double forward_scale = 1.0, backward_scale = 1.0;

    assert(x_in != NULL);
    assert(x_out != NULL);

    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape, &xout_strides,
                         &xout_itemsize, &xout_size);

    assert(xout_rank == xin_rank);

    if (xin_size == 0) {
	/* nothing to do */
	assert(xin_size == xout_size);
	return status;
    }
    assert( xin_size > 0 ); /* assert that array is non-empty */
    assert(0 <= axis && axis < xin_rank);
    assert( 0 < n && n <= xin_shape[axis] );

    assert(xout_itemsize == sizeof(MKL_Complex16));
    assert(xin_itemsize == sizeof(double));
    assert(xout_itemsize = 2 * xin_itemsize);

    for(i=0; i < axis; i++)
        assert(xin_shape[i] == xout_shape[i]);
    assert( xout_shape[axis] == (all_harmonics) ? n : n/2 + 1);
    for(i=axis+1; i < xin_rank; i++)
        assert(xin_shape[i] == xout_shape[i]);

    /* output buffer is created in Cython as C_CONTIGUOUS */
    assert(PyArray_ISONESEGMENT(x_out));

    xin_data = (double*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex16*) PyArray_DATA(x_out);
    {
        char *tmp = (char *) xin_data;
        input_strides[1] =
            ((double*) (tmp + xin_strides[axis])) - xin_data;

        assert(input_strides[0] >= 0);

        for(i=0; i < xin_rank; i++) {
            assert( (xin_strides[i] % xin_itemsize) == 0 );
            assert( (xout_strides[i] % xout_itemsize) == 0 );
            assert( xout_strides[i] > 0 );
        }

        /* Compute output strides */
        tmp = (char *) xout_data;
        output_strides[1] =
            ((MKL_Complex16*) (tmp + xout_strides[axis])) - xout_data;
    }

    single_DftiCompute =
        compute_strides_and_distances_inout(
            x_in, x_out,
            xin_rank, xin_shape, xin_strides,
            xin_itemsize, xin_size, axis,
            &input_number_of_transforms,
            &input_distance,
            xout_shape, xout_strides, xout_itemsize,
            &output_distance
        );

    if (FALSE) { /* we are doing IFFT using Forward computation, swap scales */
        forward_scale = 1.0/(fsc*n);
        backward_scale = fsc;
    } else {
        forward_scale = fsc;
        backward_scale = 1.0/(fsc*n);
    }
    status = __create_descriptor_1d_DFTI_DOUBLE_DFTI_REAL(
        _to_mkl_long(n), forward_scale, backward_scale, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(DFTI_OUTPUT_STRIDES, output_strides, dfti_cache);
    if (status != 0) goto failed;

    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_notinplace_DftiComputeForward_double_MKL_Complex16(
            xin_data, xout_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((xin_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < xin_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, xin_shape, xin_rank, mask, xin_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xin_data,
                tmp2 = (char *) xout_data,
                i = 0; i < xin_rank; i++) {
                tmp1 += xin_strides[i] * MultiIter_IndexElem(mit, i);
                tmp2 += xout_strides[i] * MultiIter_IndexElem(mit, i);
            }

            status = __cached_notinplace_DftiComputeForward_double_MKL_Complex16( 
                (double*) tmp1, (MKL_Complex16*) tmp2, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


	multi_iter_masked_free(&mit);
        mkl_free(mask);

        if (status != 0) goto failed;
    }


    /* fill the rest of the output array */
    /* TODO: optimize for 1D and 2D */
    if (all_harmonics) {
        npy_intp n_last = xout_shape[axis];
        npy_intp nh_last = (n_last/2) + 1;

        if (xout_rank == 1) {
            npy_intp it, it_max = (n_last > 2) ? n_last: nh_last;
            npy_intp s = xout_strides[0] / xout_itemsize;

            /* TODO: parallelize with openmp ? */
            for(it = nh_last; it < it_max; it++) {
                MKL_Complex16
                    *src = xout_data + (n_last - it)*s,
                    *dest = xout_data + it*s;

                SET_CONJ(dest, src);
            }
/*
        } else if(single_DftiCompute) {
*/
        } else {
            multi_iter_t mit;
            npy_intp *half_shape;

            half_shape = (npy_intp *) mkl_malloc(xout_rank * sizeof(npy_intp), 64);

            memcpy(half_shape, xout_shape, xout_rank * sizeof(npy_intp));
            half_shape[axis] = (n_last > 2) ? n_last - nh_last: 0;

            multi_iter_new(&mit, half_shape, xout_rank);
	    mkl_free(half_shape);

            while(!MultiIter_Done(mit)) {
                char *tmp1, *tmp2;
                int i;
                MKL_Complex16 *dest, *src;
                /* npy_intp k_last = MultiIter_IndexElem(mit, axis); */

#if defined(__ICC) || defined(__INTEL_COMPILER)
                #pragma novector
#endif
                for(tmp1 = (char *) xout_data, tmp2 = (char *) xout_data,
                    i = 0; i < xout_rank; i++) {
                    npy_intp si = xout_strides[i],
                             ni = xout_shape[i],
                             src_ki = MultiIter_IndexElem(mit, i),
                             dest_ki;

                    dest_ki = src_ki;
                    if (i == axis) {
                        dest_ki += nh_last;
                        assert(dest_ki > 0);
                        assert(dest_ki < ni);
                        src_ki = ni - dest_ki;
                    }

                    tmp1 += si * dest_ki;
                    tmp2 += si * src_ki;
                }
                dest = (MKL_Complex16*) tmp1;
                src  = (MKL_Complex16*) tmp2;

                SET_CONJ(dest, src);

                if (multi_iter_next(&mit))
                    break;
            }


            multi_iter_free(&mit);
        }
    }

    if (FALSE) {
       vmzConj(xout_size, xout_data, xout_data, VML_HA);
    }

    status = OK;

  cleanup:

    return status;

  failed:

    goto cleanup;
}

#line 603
int float_cfloat_mkl_ifft1d_out(
    PyArrayObject *x_in, npy_intp n, int axis, PyArrayObject *x_out,
    int all_harmonics, double fsc, DftiCache* dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0,
             output_distance = 0, input_number_of_transforms = 1;
    float *xin_data = 0;
    MKL_Complex8 *xout_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    MKL_LONG output_strides[2] = {0, 1};
    npy_intp *xin_shape = 0,
             *xin_strides = 0,
             *xout_shape = 0,
             *xout_strides = 0;
    int i, xin_rank, xout_rank;
    int single_DftiCompute = NO;
    npy_intp xin_size, xin_itemsize, xout_itemsize, xout_size;
    float forward_scale = 1.0, backward_scale = 1.0;

    assert(x_in != NULL);
    assert(x_out != NULL);

    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape, &xout_strides,
                         &xout_itemsize, &xout_size);

    assert(xout_rank == xin_rank);

    if (xin_size == 0) {
	/* nothing to do */
	assert(xin_size == xout_size);
	return status;
    }
    assert( xin_size > 0 ); /* assert that array is non-empty */
    assert(0 <= axis && axis < xin_rank);
    assert( 0 < n && n <= xin_shape[axis] );

    assert(xout_itemsize == sizeof(MKL_Complex8));
    assert(xin_itemsize == sizeof(float));
    assert(xout_itemsize = 2 * xin_itemsize);

    for(i=0; i < axis; i++)
        assert(xin_shape[i] == xout_shape[i]);
    assert( xout_shape[axis] == (all_harmonics) ? n : n/2 + 1);
    for(i=axis+1; i < xin_rank; i++)
        assert(xin_shape[i] == xout_shape[i]);

    /* output buffer is created in Cython as C_CONTIGUOUS */
    assert(PyArray_ISONESEGMENT(x_out));

    xin_data = (float*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex8*) PyArray_DATA(x_out);
    {
        char *tmp = (char *) xin_data;
        input_strides[1] =
            ((float*) (tmp + xin_strides[axis])) - xin_data;

        assert(input_strides[0] >= 0);

        for(i=0; i < xin_rank; i++) {
            assert( (xin_strides[i] % xin_itemsize) == 0 );
            assert( (xout_strides[i] % xout_itemsize) == 0 );
            assert( xout_strides[i] > 0 );
        }

        /* Compute output strides */
        tmp = (char *) xout_data;
        output_strides[1] =
            ((MKL_Complex8*) (tmp + xout_strides[axis])) - xout_data;
    }

    single_DftiCompute =
        compute_strides_and_distances_inout(
            x_in, x_out,
            xin_rank, xin_shape, xin_strides,
            xin_itemsize, xin_size, axis,
            &input_number_of_transforms,
            &input_distance,
            xout_shape, xout_strides, xout_itemsize,
            &output_distance
        );

    if (TRUE) { /* we are doing IFFT using Forward computation, swap scales */
        forward_scale = 1.0/(fsc*n);
        backward_scale = fsc;
    } else {
        forward_scale = fsc;
        backward_scale = 1.0/(fsc*n);
    }
    status = __create_descriptor_1d_DFTI_SINGLE_DFTI_REAL(
        _to_mkl_long(n), forward_scale, backward_scale, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(DFTI_OUTPUT_STRIDES, output_strides, dfti_cache);
    if (status != 0) goto failed;

    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_notinplace_DftiComputeForward_float_MKL_Complex8(
            xin_data, xout_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((xin_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < xin_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, xin_shape, xin_rank, mask, xin_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xin_data,
                tmp2 = (char *) xout_data,
                i = 0; i < xin_rank; i++) {
                tmp1 += xin_strides[i] * MultiIter_IndexElem(mit, i);
                tmp2 += xout_strides[i] * MultiIter_IndexElem(mit, i);
            }

            status = __cached_notinplace_DftiComputeForward_float_MKL_Complex8( 
                (float*) tmp1, (MKL_Complex8*) tmp2, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


	multi_iter_masked_free(&mit);
        mkl_free(mask);

        if (status != 0) goto failed;
    }


    /* fill the rest of the output array */
    /* TODO: optimize for 1D and 2D */
    if (all_harmonics) {
        npy_intp n_last = xout_shape[axis];
        npy_intp nh_last = (n_last/2) + 1;

        if (xout_rank == 1) {
            npy_intp it, it_max = (n_last > 2) ? n_last: nh_last;
            npy_intp s = xout_strides[0] / xout_itemsize;

            /* TODO: parallelize with openmp ? */
            for(it = nh_last; it < it_max; it++) {
                MKL_Complex8
                    *src = xout_data + (n_last - it)*s,
                    *dest = xout_data + it*s;

                SET_CONJ(dest, src);
            }
/*
        } else if(single_DftiCompute) {
*/
        } else {
            multi_iter_t mit;
            npy_intp *half_shape;

            half_shape = (npy_intp *) mkl_malloc(xout_rank * sizeof(npy_intp), 64);

            memcpy(half_shape, xout_shape, xout_rank * sizeof(npy_intp));
            half_shape[axis] = (n_last > 2) ? n_last - nh_last: 0;

            multi_iter_new(&mit, half_shape, xout_rank);
	    mkl_free(half_shape);

            while(!MultiIter_Done(mit)) {
                char *tmp1, *tmp2;
                int i;
                MKL_Complex8 *dest, *src;
                /* npy_intp k_last = MultiIter_IndexElem(mit, axis); */

#if defined(__ICC) || defined(__INTEL_COMPILER)
                #pragma novector
#endif
                for(tmp1 = (char *) xout_data, tmp2 = (char *) xout_data,
                    i = 0; i < xout_rank; i++) {
                    npy_intp si = xout_strides[i],
                             ni = xout_shape[i],
                             src_ki = MultiIter_IndexElem(mit, i),
                             dest_ki;

                    dest_ki = src_ki;
                    if (i == axis) {
                        dest_ki += nh_last;
                        assert(dest_ki > 0);
                        assert(dest_ki < ni);
                        src_ki = ni - dest_ki;
                    }

                    tmp1 += si * dest_ki;
                    tmp2 += si * src_ki;
                }
                dest = (MKL_Complex8*) tmp1;
                src  = (MKL_Complex8*) tmp2;

                SET_CONJ(dest, src);

                if (multi_iter_next(&mit))
                    break;
            }


            multi_iter_free(&mit);
        }
    }

    if (TRUE) {
       vmcConj(xout_size, xout_data, xout_data, VML_HA);
    }

    status = OK;

  cleanup:

    return status;

  failed:

    goto cleanup;
}

#line 603
int double_cdouble_mkl_ifft1d_out(
    PyArrayObject *x_in, npy_intp n, int axis, PyArrayObject *x_out,
    int all_harmonics, double fsc, DftiCache* dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0,
             output_distance = 0, input_number_of_transforms = 1;
    double *xin_data = 0;
    MKL_Complex16 *xout_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    MKL_LONG output_strides[2] = {0, 1};
    npy_intp *xin_shape = 0,
             *xin_strides = 0,
             *xout_shape = 0,
             *xout_strides = 0;
    int i, xin_rank, xout_rank;
    int single_DftiCompute = NO;
    npy_intp xin_size, xin_itemsize, xout_itemsize, xout_size;
    double forward_scale = 1.0, backward_scale = 1.0;

    assert(x_in != NULL);
    assert(x_out != NULL);

    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape, &xout_strides,
                         &xout_itemsize, &xout_size);

    assert(xout_rank == xin_rank);

    if (xin_size == 0) {
	/* nothing to do */
	assert(xin_size == xout_size);
	return status;
    }
    assert( xin_size > 0 ); /* assert that array is non-empty */
    assert(0 <= axis && axis < xin_rank);
    assert( 0 < n && n <= xin_shape[axis] );

    assert(xout_itemsize == sizeof(MKL_Complex16));
    assert(xin_itemsize == sizeof(double));
    assert(xout_itemsize = 2 * xin_itemsize);

    for(i=0; i < axis; i++)
        assert(xin_shape[i] == xout_shape[i]);
    assert( xout_shape[axis] == (all_harmonics) ? n : n/2 + 1);
    for(i=axis+1; i < xin_rank; i++)
        assert(xin_shape[i] == xout_shape[i]);

    /* output buffer is created in Cython as C_CONTIGUOUS */
    assert(PyArray_ISONESEGMENT(x_out));

    xin_data = (double*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex16*) PyArray_DATA(x_out);
    {
        char *tmp = (char *) xin_data;
        input_strides[1] =
            ((double*) (tmp + xin_strides[axis])) - xin_data;

        assert(input_strides[0] >= 0);

        for(i=0; i < xin_rank; i++) {
            assert( (xin_strides[i] % xin_itemsize) == 0 );
            assert( (xout_strides[i] % xout_itemsize) == 0 );
            assert( xout_strides[i] > 0 );
        }

        /* Compute output strides */
        tmp = (char *) xout_data;
        output_strides[1] =
            ((MKL_Complex16*) (tmp + xout_strides[axis])) - xout_data;
    }

    single_DftiCompute =
        compute_strides_and_distances_inout(
            x_in, x_out,
            xin_rank, xin_shape, xin_strides,
            xin_itemsize, xin_size, axis,
            &input_number_of_transforms,
            &input_distance,
            xout_shape, xout_strides, xout_itemsize,
            &output_distance
        );

    if (TRUE) { /* we are doing IFFT using Forward computation, swap scales */
        forward_scale = 1.0/(fsc*n);
        backward_scale = fsc;
    } else {
        forward_scale = fsc;
        backward_scale = 1.0/(fsc*n);
    }
    status = __create_descriptor_1d_DFTI_DOUBLE_DFTI_REAL(
        _to_mkl_long(n), forward_scale, backward_scale, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(DFTI_OUTPUT_STRIDES, output_strides, dfti_cache);
    if (status != 0) goto failed;

    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_notinplace_DftiComputeForward_double_MKL_Complex16(
            xin_data, xout_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((xin_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < xin_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, xin_shape, xin_rank, mask, xin_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xin_data,
                tmp2 = (char *) xout_data,
                i = 0; i < xin_rank; i++) {
                tmp1 += xin_strides[i] * MultiIter_IndexElem(mit, i);
                tmp2 += xout_strides[i] * MultiIter_IndexElem(mit, i);
            }

            status = __cached_notinplace_DftiComputeForward_double_MKL_Complex16( 
                (double*) tmp1, (MKL_Complex16*) tmp2, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


	multi_iter_masked_free(&mit);
        mkl_free(mask);

        if (status != 0) goto failed;
    }


    /* fill the rest of the output array */
    /* TODO: optimize for 1D and 2D */
    if (all_harmonics) {
        npy_intp n_last = xout_shape[axis];
        npy_intp nh_last = (n_last/2) + 1;

        if (xout_rank == 1) {
            npy_intp it, it_max = (n_last > 2) ? n_last: nh_last;
            npy_intp s = xout_strides[0] / xout_itemsize;

            /* TODO: parallelize with openmp ? */
            for(it = nh_last; it < it_max; it++) {
                MKL_Complex16
                    *src = xout_data + (n_last - it)*s,
                    *dest = xout_data + it*s;

                SET_CONJ(dest, src);
            }
/*
        } else if(single_DftiCompute) {
*/
        } else {
            multi_iter_t mit;
            npy_intp *half_shape;

            half_shape = (npy_intp *) mkl_malloc(xout_rank * sizeof(npy_intp), 64);

            memcpy(half_shape, xout_shape, xout_rank * sizeof(npy_intp));
            half_shape[axis] = (n_last > 2) ? n_last - nh_last: 0;

            multi_iter_new(&mit, half_shape, xout_rank);
	    mkl_free(half_shape);

            while(!MultiIter_Done(mit)) {
                char *tmp1, *tmp2;
                int i;
                MKL_Complex16 *dest, *src;
                /* npy_intp k_last = MultiIter_IndexElem(mit, axis); */

#if defined(__ICC) || defined(__INTEL_COMPILER)
                #pragma novector
#endif
                for(tmp1 = (char *) xout_data, tmp2 = (char *) xout_data,
                    i = 0; i < xout_rank; i++) {
                    npy_intp si = xout_strides[i],
                             ni = xout_shape[i],
                             src_ki = MultiIter_IndexElem(mit, i),
                             dest_ki;

                    dest_ki = src_ki;
                    if (i == axis) {
                        dest_ki += nh_last;
                        assert(dest_ki > 0);
                        assert(dest_ki < ni);
                        src_ki = ni - dest_ki;
                    }

                    tmp1 += si * dest_ki;
                    tmp2 += si * src_ki;
                }
                dest = (MKL_Complex16*) tmp1;
                src  = (MKL_Complex16*) tmp2;

                SET_CONJ(dest, src);

                if (multi_iter_next(&mit))
                    break;
            }


            multi_iter_free(&mit);
        }
    }

    if (TRUE) {
       vmzConj(xout_size, xout_data, xout_data, VML_HA);
    }

    status = OK;

  cleanup:

    return status;

  failed:

    goto cleanup;
}


#line 875
int cfloat_cfloat_mkl_fft1d_out(
    PyArrayObject *x_in, npy_intp n, int axis, PyArrayObject *x_out, double fsc, DftiCache *dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0, output_distance = 0,
             input_number_of_transforms = 1;
    MKL_Complex8 *xin_data = 0;
    MKL_Complex8 *xout_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    MKL_LONG output_strides[2] = {0, 1};
    npy_intp *xin_shape = 0, *xin_strides = 0,
             *xout_shape = 0, *xout_strides = 0;
    int i, xin_rank = 0, xout_rank = 0;
    int single_DftiCompute = NO;
    npy_intp xin_size = 0, xin_itemsize = 0, xout_itemsize = 0, xout_size = 0;


    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    assert(xout_rank == xin_rank);

    if (xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert( xin_size > 0 ); /* assert that array is non-empty */
    assert(0 <= axis && axis < xin_rank);
    assert( 0 < n && n <= xin_shape[axis] );

    assert(xout_itemsize == sizeof(MKL_Complex8));
    assert(xin_itemsize == sizeof(MKL_Complex8));

    for(i=0; i < xin_rank; i++)
        assert( xin_shape[i] >= xout_shape[i]);

    /* output buffer is created in Cython as C_CONTIGUOUS */
    assert(PyArray_ISONESEGMENT(x_out));

    xin_data = (MKL_Complex8*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex8*) PyArray_DATA(x_out);
    {
        char *tmp = (char *) xin_data;
        input_strides[1] =
            ((MKL_Complex8*) (tmp + xin_strides[axis])) - xin_data;

        assert(input_strides[0] >= 0);

        for(i=0; i < xin_rank; i++) {
            assert( (xin_strides[i] % xin_itemsize) == 0 );
            assert( (xout_strides[i] % xout_itemsize) == 0 );
            assert( xout_strides[i] > 0 );
        }

        /* Compute output strides */
        tmp = (char *) xout_data;
        output_strides[1] =
            ((MKL_Complex8*) (tmp + xout_strides[axis])) - xout_data;
    }

    single_DftiCompute =
        compute_strides_and_distances_inout(
            x_in, x_out,
            xin_rank, xin_shape, xin_strides,
            xin_itemsize, xin_size, axis,
            &input_number_of_transforms,
            &input_distance,
            xout_shape, xout_strides, xout_itemsize,
            &output_distance
        );

    status = __create_descriptor_1d_DFTI_SINGLE_DFTI_COMPLEX(
        _to_mkl_long(n), fsc, 1.0/(fsc*n), dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_OUTPUT_STRIDES, output_strides, dfti_cache);
    if (status != 0) goto failed;

    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else {
        assert(input_number_of_transforms == 1);

        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_notinplace_DftiComputeForward_MKL_Complex8_MKL_Complex8(
            xin_data, xout_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((xin_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < xin_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, xin_shape, xin_rank, mask, xin_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xin_data,
                tmp2 = (char *) xout_data,
                i = 0; i < xin_rank; i++) {
                tmp1 += xin_strides[i] * MultiIter_IndexElem(mit, i);
                tmp2 += xout_strides[i] * MultiIter_IndexElem(mit, i);
            }

            status = __cached_notinplace_DftiComputeForward_MKL_Complex8_MKL_Complex8(
                (MKL_Complex8*) tmp1, (MKL_Complex8*) tmp2, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


        multi_iter_masked_free(&mit);
        mkl_free(mask);

        if (status != 0) goto failed;
    }

    status = OK;

  cleanup:

    return status;

  failed:

    goto cleanup;
}

#line 875
int cdouble_cdouble_mkl_fft1d_out(
    PyArrayObject *x_in, npy_intp n, int axis, PyArrayObject *x_out, double fsc, DftiCache *dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0, output_distance = 0,
             input_number_of_transforms = 1;
    MKL_Complex16 *xin_data = 0;
    MKL_Complex16 *xout_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    MKL_LONG output_strides[2] = {0, 1};
    npy_intp *xin_shape = 0, *xin_strides = 0,
             *xout_shape = 0, *xout_strides = 0;
    int i, xin_rank = 0, xout_rank = 0;
    int single_DftiCompute = NO;
    npy_intp xin_size = 0, xin_itemsize = 0, xout_itemsize = 0, xout_size = 0;


    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    assert(xout_rank == xin_rank);

    if (xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert( xin_size > 0 ); /* assert that array is non-empty */
    assert(0 <= axis && axis < xin_rank);
    assert( 0 < n && n <= xin_shape[axis] );

    assert(xout_itemsize == sizeof(MKL_Complex16));
    assert(xin_itemsize == sizeof(MKL_Complex16));

    for(i=0; i < xin_rank; i++)
        assert( xin_shape[i] >= xout_shape[i]);

    /* output buffer is created in Cython as C_CONTIGUOUS */
    assert(PyArray_ISONESEGMENT(x_out));

    xin_data = (MKL_Complex16*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex16*) PyArray_DATA(x_out);
    {
        char *tmp = (char *) xin_data;
        input_strides[1] =
            ((MKL_Complex16*) (tmp + xin_strides[axis])) - xin_data;

        assert(input_strides[0] >= 0);

        for(i=0; i < xin_rank; i++) {
            assert( (xin_strides[i] % xin_itemsize) == 0 );
            assert( (xout_strides[i] % xout_itemsize) == 0 );
            assert( xout_strides[i] > 0 );
        }

        /* Compute output strides */
        tmp = (char *) xout_data;
        output_strides[1] =
            ((MKL_Complex16*) (tmp + xout_strides[axis])) - xout_data;
    }

    single_DftiCompute =
        compute_strides_and_distances_inout(
            x_in, x_out,
            xin_rank, xin_shape, xin_strides,
            xin_itemsize, xin_size, axis,
            &input_number_of_transforms,
            &input_distance,
            xout_shape, xout_strides, xout_itemsize,
            &output_distance
        );

    status = __create_descriptor_1d_DFTI_DOUBLE_DFTI_COMPLEX(
        _to_mkl_long(n), fsc, 1.0/(fsc*n), dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_OUTPUT_STRIDES, output_strides, dfti_cache);
    if (status != 0) goto failed;

    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else {
        assert(input_number_of_transforms == 1);

        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_notinplace_DftiComputeForward_MKL_Complex16_MKL_Complex16(
            xin_data, xout_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((xin_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < xin_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, xin_shape, xin_rank, mask, xin_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xin_data,
                tmp2 = (char *) xout_data,
                i = 0; i < xin_rank; i++) {
                tmp1 += xin_strides[i] * MultiIter_IndexElem(mit, i);
                tmp2 += xout_strides[i] * MultiIter_IndexElem(mit, i);
            }

            status = __cached_notinplace_DftiComputeForward_MKL_Complex16_MKL_Complex16(
                (MKL_Complex16*) tmp1, (MKL_Complex16*) tmp2, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


        multi_iter_masked_free(&mit);
        mkl_free(mask);

        if (status != 0) goto failed;
    }

    status = OK;

  cleanup:

    return status;

  failed:

    goto cleanup;
}

#line 875
int cfloat_cfloat_mkl_ifft1d_out(
    PyArrayObject *x_in, npy_intp n, int axis, PyArrayObject *x_out, double fsc, DftiCache *dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0, output_distance = 0,
             input_number_of_transforms = 1;
    MKL_Complex8 *xin_data = 0;
    MKL_Complex8 *xout_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    MKL_LONG output_strides[2] = {0, 1};
    npy_intp *xin_shape = 0, *xin_strides = 0,
             *xout_shape = 0, *xout_strides = 0;
    int i, xin_rank = 0, xout_rank = 0;
    int single_DftiCompute = NO;
    npy_intp xin_size = 0, xin_itemsize = 0, xout_itemsize = 0, xout_size = 0;


    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    assert(xout_rank == xin_rank);

    if (xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert( xin_size > 0 ); /* assert that array is non-empty */
    assert(0 <= axis && axis < xin_rank);
    assert( 0 < n && n <= xin_shape[axis] );

    assert(xout_itemsize == sizeof(MKL_Complex8));
    assert(xin_itemsize == sizeof(MKL_Complex8));

    for(i=0; i < xin_rank; i++)
        assert( xin_shape[i] >= xout_shape[i]);

    /* output buffer is created in Cython as C_CONTIGUOUS */
    assert(PyArray_ISONESEGMENT(x_out));

    xin_data = (MKL_Complex8*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex8*) PyArray_DATA(x_out);
    {
        char *tmp = (char *) xin_data;
        input_strides[1] =
            ((MKL_Complex8*) (tmp + xin_strides[axis])) - xin_data;

        assert(input_strides[0] >= 0);

        for(i=0; i < xin_rank; i++) {
            assert( (xin_strides[i] % xin_itemsize) == 0 );
            assert( (xout_strides[i] % xout_itemsize) == 0 );
            assert( xout_strides[i] > 0 );
        }

        /* Compute output strides */
        tmp = (char *) xout_data;
        output_strides[1] =
            ((MKL_Complex8*) (tmp + xout_strides[axis])) - xout_data;
    }

    single_DftiCompute =
        compute_strides_and_distances_inout(
            x_in, x_out,
            xin_rank, xin_shape, xin_strides,
            xin_itemsize, xin_size, axis,
            &input_number_of_transforms,
            &input_distance,
            xout_shape, xout_strides, xout_itemsize,
            &output_distance
        );

    status = __create_descriptor_1d_DFTI_SINGLE_DFTI_COMPLEX(
        _to_mkl_long(n), fsc, 1.0/(fsc*n), dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_OUTPUT_STRIDES, output_strides, dfti_cache);
    if (status != 0) goto failed;

    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else {
        assert(input_number_of_transforms == 1);

        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_notinplace_DftiComputeBackward_MKL_Complex8_MKL_Complex8(
            xin_data, xout_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((xin_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < xin_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, xin_shape, xin_rank, mask, xin_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xin_data,
                tmp2 = (char *) xout_data,
                i = 0; i < xin_rank; i++) {
                tmp1 += xin_strides[i] * MultiIter_IndexElem(mit, i);
                tmp2 += xout_strides[i] * MultiIter_IndexElem(mit, i);
            }

            status = __cached_notinplace_DftiComputeBackward_MKL_Complex8_MKL_Complex8(
                (MKL_Complex8*) tmp1, (MKL_Complex8*) tmp2, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


        multi_iter_masked_free(&mit);
        mkl_free(mask);

        if (status != 0) goto failed;
    }

    status = OK;

  cleanup:

    return status;

  failed:

    goto cleanup;
}

#line 875
int cdouble_cdouble_mkl_ifft1d_out(
    PyArrayObject *x_in, npy_intp n, int axis, PyArrayObject *x_out, double fsc, DftiCache *dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0, output_distance = 0,
             input_number_of_transforms = 1;
    MKL_Complex16 *xin_data = 0;
    MKL_Complex16 *xout_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    MKL_LONG output_strides[2] = {0, 1};
    npy_intp *xin_shape = 0, *xin_strides = 0,
             *xout_shape = 0, *xout_strides = 0;
    int i, xin_rank = 0, xout_rank = 0;
    int single_DftiCompute = NO;
    npy_intp xin_size = 0, xin_itemsize = 0, xout_itemsize = 0, xout_size = 0;


    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    assert(xout_rank == xin_rank);

    if (xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert( xin_size > 0 ); /* assert that array is non-empty */
    assert(0 <= axis && axis < xin_rank);
    assert( 0 < n && n <= xin_shape[axis] );

    assert(xout_itemsize == sizeof(MKL_Complex16));
    assert(xin_itemsize == sizeof(MKL_Complex16));

    for(i=0; i < xin_rank; i++)
        assert( xin_shape[i] >= xout_shape[i]);

    /* output buffer is created in Cython as C_CONTIGUOUS */
    assert(PyArray_ISONESEGMENT(x_out));

    xin_data = (MKL_Complex16*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex16*) PyArray_DATA(x_out);
    {
        char *tmp = (char *) xin_data;
        input_strides[1] =
            ((MKL_Complex16*) (tmp + xin_strides[axis])) - xin_data;

        assert(input_strides[0] >= 0);

        for(i=0; i < xin_rank; i++) {
            assert( (xin_strides[i] % xin_itemsize) == 0 );
            assert( (xout_strides[i] % xout_itemsize) == 0 );
            assert( xout_strides[i] > 0 );
        }

        /* Compute output strides */
        tmp = (char *) xout_data;
        output_strides[1] =
            ((MKL_Complex16*) (tmp + xout_strides[axis])) - xout_data;
    }

    single_DftiCompute =
        compute_strides_and_distances_inout(
            x_in, x_out,
            xin_rank, xin_shape, xin_strides,
            xin_itemsize, xin_size, axis,
            &input_number_of_transforms,
            &input_distance,
            xout_shape, xout_strides, xout_itemsize,
            &output_distance
        );

    status = __create_descriptor_1d_DFTI_DOUBLE_DFTI_COMPLEX(
        _to_mkl_long(n), fsc, 1.0/(fsc*n), dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_OUTPUT_STRIDES, output_strides, dfti_cache);
    if (status != 0) goto failed;

    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else {
        assert(input_number_of_transforms == 1);

        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_notinplace_DftiComputeBackward_MKL_Complex16_MKL_Complex16(
            xin_data, xout_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((xin_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < xin_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, xin_shape, xin_rank, mask, xin_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xin_data,
                tmp2 = (char *) xout_data,
                i = 0; i < xin_rank; i++) {
                tmp1 += xin_strides[i] * MultiIter_IndexElem(mit, i);
                tmp2 += xout_strides[i] * MultiIter_IndexElem(mit, i);
            }

            status = __cached_notinplace_DftiComputeBackward_MKL_Complex16_MKL_Complex16(
                (MKL_Complex16*) tmp1, (MKL_Complex16*) tmp2, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


        multi_iter_masked_free(&mit);
        mkl_free(mask);

        if (status != 0) goto failed;
    }

    status = OK;

  cleanup:

    return status;

  failed:

    goto cleanup;
}


/* ====================  Real FFT ===================== *
     This is 1D FFT of real vector the produces a real
     vector with packed storage of conjugate even
     harmonics.
 * ==================================================== */

#line 1063
/* n here is the length of the output along the axis */
int
cfloat_float_mkl_irfft_out(
    PyArrayObject* x_in, npy_intp n, int axis, PyArrayObject* x_out, double fsc, DftiCache *dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0, output_distance = 0,
             input_number_of_transforms = 1;
    MKL_Complex8 *xin_data = 0;
    float *xout_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    MKL_LONG output_strides[2] = {0, 1};
    npy_intp *xin_shape = 0, *xin_strides = 0,
             *xout_shape = 0, *xout_strides = 0;
    int i, xin_rank = 0, xout_rank = 0;
    int single_DftiCompute = NO;
    npy_intp xin_size = 0, xin_itemsize = 0, xout_itemsize = 0, xout_size = 0;


    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    assert(xout_rank == xin_rank);

    if (xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert( xin_size > 0 ); /* assert that array is non-empty */
    assert(0 <= axis && axis < xin_rank);
    assert( 0 < n && n <= xout_shape[axis] );

    assert(xout_itemsize == sizeof(float));
    assert(xin_itemsize == sizeof(MKL_Complex8));

    /* output buffer is created in Cython as C_CONTIGUOUS */
    assert(PyArray_ISONESEGMENT(x_out));

    xin_data = (MKL_Complex8*) PyArray_DATA(x_in);
    xout_data = (float*) PyArray_DATA(x_out);
    {
        char *tmp = (char *) xin_data;
        input_strides[1] =
            ((MKL_Complex8*) (tmp + xin_strides[axis])) - xin_data;

        assert(input_strides[0] >= 0);

        for(i=0; i < xin_rank; i++) {
            assert( (xin_strides[i] % xin_itemsize) == 0 );
            assert( (xout_strides[i] % xout_itemsize) == 0 );
            assert( xout_strides[i] > 0 );
        }

        /* Compute output strides */
        tmp = (char *) xout_data;
        output_strides[1] =
            ((float*) (tmp + xout_strides[axis])) - xout_data;
    }

    single_DftiCompute =
        compute_strides_and_distances_inout(
            x_in, x_out,
            xin_rank, xin_shape, xin_strides,
            xin_itemsize, xin_size, axis,
            &input_number_of_transforms,
            &input_distance,
            xout_shape, xout_strides, xout_itemsize,
            &output_distance
        );

    status = __create_descriptor_1d_DFTI_SINGLE_DFTI_REAL(
        _to_mkl_long(n), fsc, 1.0/(fsc*n), dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_OUTPUT_STRIDES, output_strides, dfti_cache);
    if (status != 0) goto failed;

    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else {
        assert(input_number_of_transforms == 1);

        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_notinplace_DftiComputeBackward_MKL_Complex8_float(
            xin_data, xout_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((xin_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < xin_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, xin_shape, xin_rank, mask, xin_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xin_data,
                tmp2 = (char *) xout_data,
                i = 0; i < xin_rank; i++) {
                tmp1 += xin_strides[i] * MultiIter_IndexElem(mit, i);
                tmp2 += xout_strides[i] * MultiIter_IndexElem(mit, i);
            }

            status = __cached_notinplace_DftiComputeBackward_MKL_Complex8_float(
                (MKL_Complex8*) tmp1, (float*) tmp2, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


        multi_iter_masked_free(&mit);
        mkl_free(mask);

        if (status != 0) goto failed;
    }

    status = OK;

  cleanup:

    return status;

  failed:

    goto cleanup;
}

#line 1063
/* n here is the length of the output along the axis */
int
cdouble_double_mkl_irfft_out(
    PyArrayObject* x_in, npy_intp n, int axis, PyArrayObject* x_out, double fsc, DftiCache *dfti_cache)
{
    MKL_LONG status = 0, input_distance = 0, output_distance = 0,
             input_number_of_transforms = 1;
    MKL_Complex16 *xin_data = 0;
    double *xout_data = 0;
    /* For 1D transformes strides is a length-2 array */
    MKL_LONG input_strides[2] = {0, 1};
    MKL_LONG output_strides[2] = {0, 1};
    npy_intp *xin_shape = 0, *xin_strides = 0,
             *xout_shape = 0, *xout_strides = 0;
    int i, xin_rank = 0, xout_rank = 0;
    int single_DftiCompute = NO;
    npy_intp xin_size = 0, xin_itemsize = 0, xout_itemsize = 0, xout_size = 0;


    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    assert(xout_rank == xin_rank);

    if (xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert( xin_size > 0 ); /* assert that array is non-empty */
    assert(0 <= axis && axis < xin_rank);
    assert( 0 < n && n <= xout_shape[axis] );

    assert(xout_itemsize == sizeof(double));
    assert(xin_itemsize == sizeof(MKL_Complex16));

    /* output buffer is created in Cython as C_CONTIGUOUS */
    assert(PyArray_ISONESEGMENT(x_out));

    xin_data = (MKL_Complex16*) PyArray_DATA(x_in);
    xout_data = (double*) PyArray_DATA(x_out);
    {
        char *tmp = (char *) xin_data;
        input_strides[1] =
            ((MKL_Complex16*) (tmp + xin_strides[axis])) - xin_data;

        assert(input_strides[0] >= 0);

        for(i=0; i < xin_rank; i++) {
            assert( (xin_strides[i] % xin_itemsize) == 0 );
            assert( (xout_strides[i] % xout_itemsize) == 0 );
            assert( xout_strides[i] > 0 );
        }

        /* Compute output strides */
        tmp = (char *) xout_data;
        output_strides[1] =
            ((double*) (tmp + xout_strides[axis])) - xout_data;
    }

    single_DftiCompute =
        compute_strides_and_distances_inout(
            x_in, x_out,
            xin_rank, xin_shape, xin_strides,
            xin_itemsize, xin_size, axis,
            &input_number_of_transforms,
            &input_distance,
            xout_shape, xout_strides, xout_itemsize,
            &output_distance
        );

    status = __create_descriptor_1d_DFTI_DOUBLE_DFTI_REAL(
        _to_mkl_long(n), fsc, 1.0/(fsc*n), dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_INPUT_STRIDES, input_strides, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_enum(
        DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache);
    if (status != 0) goto failed;

    status = __set_descriptor_1d_value_longp(
        DFTI_OUTPUT_STRIDES, output_strides, dfti_cache);
    if (status != 0) goto failed;

    if (input_number_of_transforms > 1) {
        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_INPUT_DISTANCE, input_distance, dfti_cache);
        if (status != 0) goto failed;

        status = __set_descriptor_1d_value_long(
            DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache);
        if (status != 0) goto failed;
    }
    else {
        assert(input_number_of_transforms == 1);

        status = __set_descriptor_1d_value_long(
            DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache);
        if (status != 0) goto failed;
    }

    status = __commit_descriptor_1d(dfti_cache);
    if (status != 0) goto failed;

    if (single_DftiCompute){
        status = __cached_notinplace_DftiComputeBackward_MKL_Complex16_double(
            xin_data, xout_data, dfti_cache);
        if (status != 0) goto failed;
    } else {
        multi_iter_masked_t mit;
        int *mask;
        int i;

        mask = (int *) mkl_malloc((xin_rank-1)*sizeof(int), 64);

        for(i = 0; i < axis; i++) mask[i] = i;
        for(i = axis + 1; i < xin_rank; i++) mask[i-1] = i;

        multi_iter_masked_new(&mit, xin_shape, xin_rank, mask, xin_rank - 1);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xin_data,
                tmp2 = (char *) xout_data,
                i = 0; i < xin_rank; i++) {
                tmp1 += xin_strides[i] * MultiIter_IndexElem(mit, i);
                tmp2 += xout_strides[i] * MultiIter_IndexElem(mit, i);
            }

            status = __cached_notinplace_DftiComputeBackward_MKL_Complex16_double(
                (MKL_Complex16*) tmp1, (double*) tmp2, dfti_cache);
            if (status != 0) break;

            if (multi_iter_masked_next(&mit))
                break;
        }


        multi_iter_masked_free(&mit);
        mkl_free(mask);

        if (status != 0) goto failed;
    }

    status = OK;

  cleanup:

    return status;

  failed:

    goto cleanup;
}


#line 1246
int
cfloat_cfloat_mkl_fftnd_in(PyArrayObject *x_inout, double fsc)
{
    DFTI_DESCRIPTOR_HANDLE hand = 0;
    MKL_LONG status = 0;
    MKL_Complex8 *x_data = 0;
    MKL_LONG *input_strides = 0;
    npy_intp *x_shape = 0, *x_strides = 0;
    MKL_LONG *x_shape_mkl = 0;
    int i, x_rank;
    npy_intp x_size, x_itemsize;

    get_basic_array_data(x_inout, &x_rank, &x_shape,
                         &x_strides, &x_itemsize, &x_size);


    if (x_size == 0) /* nothing to do */
	return status;

    assert(x_size > 0);
    assert(x_itemsize == sizeof(MKL_Complex8));

    if (x_rank == 1) {
        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_SINGLE,
                DFTI_COMPLEX,
                x_rank,
                _to_mkl_long(x_shape[0])
                );
    } else {
        if(NPY_LIKELY(sizeof(MKL_LONG) == sizeof(npy_intp))) {
            x_shape_mkl = (MKL_LONG*)x_shape;
        } else {
            // TODO: should rather use alloca or something optimized for small allocations
            x_shape_mkl = (MKL_LONG *) mkl_malloc(x_rank*sizeof(MKL_LONG), 64);
            if (!x_shape_mkl) goto cleanup;
            for(i = 0; i < x_rank; i++)
                x_shape_mkl[i] = _to_mkl_long(x_shape[i]);
        }

        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_SINGLE,
                DFTI_COMPLEX,
                x_rank,
                x_shape_mkl
                );
    }
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_FORWARD_SCALE, fsc);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_BACKWARD_SCALE, 1.0 / (fsc * x_size));
    if (0 != status) goto cleanup;

    input_strides = (MKL_LONG *) mkl_malloc((x_rank + 1) * sizeof(MKL_LONG), 64);
    if (!input_strides) goto cleanup;

    input_strides[0] = 0;
    for(i = 0; i < x_rank; i++) {
        assert( x_strides[i] % x_itemsize == 0 );
        input_strides[i + 1] = _to_mkl_long (x_strides[i] / x_itemsize);
    }

    status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_INPLACE);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_INPUT_STRIDES, input_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (0 != status) goto cleanup;

    x_data = (MKL_Complex8*) PyArray_DATA(x_inout);

    Py_BEGIN_ALLOW_THREADS
    status = DftiCommitDescriptor(hand);
    if (0 == status) {
        status = DftiComputeForward(hand, x_data);
    }
    Py_END_ALLOW_THREADS
    if (0 != status) goto cleanup;

  cleanup:
    if(hand) DftiFreeDescriptor(&hand);
    if(input_strides) mkl_free(input_strides);
    if(x_shape_mkl != (MKL_LONG*) x_shape) mkl_free(x_shape_mkl);

    return status;

}

#line 1246
int
cdouble_cdouble_mkl_fftnd_in(PyArrayObject *x_inout, double fsc)
{
    DFTI_DESCRIPTOR_HANDLE hand = 0;
    MKL_LONG status = 0;
    MKL_Complex16 *x_data = 0;
    MKL_LONG *input_strides = 0;
    npy_intp *x_shape = 0, *x_strides = 0;
    MKL_LONG *x_shape_mkl = 0;
    int i, x_rank;
    npy_intp x_size, x_itemsize;

    get_basic_array_data(x_inout, &x_rank, &x_shape,
                         &x_strides, &x_itemsize, &x_size);


    if (x_size == 0) /* nothing to do */
	return status;

    assert(x_size > 0);
    assert(x_itemsize == sizeof(MKL_Complex16));

    if (x_rank == 1) {
        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_DOUBLE,
                DFTI_COMPLEX,
                x_rank,
                _to_mkl_long(x_shape[0])
                );
    } else {
        if(NPY_LIKELY(sizeof(MKL_LONG) == sizeof(npy_intp))) {
            x_shape_mkl = (MKL_LONG*)x_shape;
        } else {
            // TODO: should rather use alloca or something optimized for small allocations
            x_shape_mkl = (MKL_LONG *) mkl_malloc(x_rank*sizeof(MKL_LONG), 64);
            if (!x_shape_mkl) goto cleanup;
            for(i = 0; i < x_rank; i++)
                x_shape_mkl[i] = _to_mkl_long(x_shape[i]);
        }

        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_DOUBLE,
                DFTI_COMPLEX,
                x_rank,
                x_shape_mkl
                );
    }
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_FORWARD_SCALE, fsc);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_BACKWARD_SCALE, 1.0 / (fsc * x_size));
    if (0 != status) goto cleanup;

    input_strides = (MKL_LONG *) mkl_malloc((x_rank + 1) * sizeof(MKL_LONG), 64);
    if (!input_strides) goto cleanup;

    input_strides[0] = 0;
    for(i = 0; i < x_rank; i++) {
        assert( x_strides[i] % x_itemsize == 0 );
        input_strides[i + 1] = _to_mkl_long (x_strides[i] / x_itemsize);
    }

    status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_INPLACE);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_INPUT_STRIDES, input_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (0 != status) goto cleanup;

    x_data = (MKL_Complex16*) PyArray_DATA(x_inout);

    Py_BEGIN_ALLOW_THREADS
    status = DftiCommitDescriptor(hand);
    if (0 == status) {
        status = DftiComputeForward(hand, x_data);
    }
    Py_END_ALLOW_THREADS
    if (0 != status) goto cleanup;

  cleanup:
    if(hand) DftiFreeDescriptor(&hand);
    if(input_strides) mkl_free(input_strides);
    if(x_shape_mkl != (MKL_LONG*) x_shape) mkl_free(x_shape_mkl);

    return status;

}

#line 1246
int
cfloat_cfloat_mkl_ifftnd_in(PyArrayObject *x_inout, double fsc)
{
    DFTI_DESCRIPTOR_HANDLE hand = 0;
    MKL_LONG status = 0;
    MKL_Complex8 *x_data = 0;
    MKL_LONG *input_strides = 0;
    npy_intp *x_shape = 0, *x_strides = 0;
    MKL_LONG *x_shape_mkl = 0;
    int i, x_rank;
    npy_intp x_size, x_itemsize;

    get_basic_array_data(x_inout, &x_rank, &x_shape,
                         &x_strides, &x_itemsize, &x_size);


    if (x_size == 0) /* nothing to do */
	return status;

    assert(x_size > 0);
    assert(x_itemsize == sizeof(MKL_Complex8));

    if (x_rank == 1) {
        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_SINGLE,
                DFTI_COMPLEX,
                x_rank,
                _to_mkl_long(x_shape[0])
                );
    } else {
        if(NPY_LIKELY(sizeof(MKL_LONG) == sizeof(npy_intp))) {
            x_shape_mkl = (MKL_LONG*)x_shape;
        } else {
            // TODO: should rather use alloca or something optimized for small allocations
            x_shape_mkl = (MKL_LONG *) mkl_malloc(x_rank*sizeof(MKL_LONG), 64);
            if (!x_shape_mkl) goto cleanup;
            for(i = 0; i < x_rank; i++)
                x_shape_mkl[i] = _to_mkl_long(x_shape[i]);
        }

        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_SINGLE,
                DFTI_COMPLEX,
                x_rank,
                x_shape_mkl
                );
    }
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_FORWARD_SCALE, fsc);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_BACKWARD_SCALE, 1.0 / (fsc * x_size));
    if (0 != status) goto cleanup;

    input_strides = (MKL_LONG *) mkl_malloc((x_rank + 1) * sizeof(MKL_LONG), 64);
    if (!input_strides) goto cleanup;

    input_strides[0] = 0;
    for(i = 0; i < x_rank; i++) {
        assert( x_strides[i] % x_itemsize == 0 );
        input_strides[i + 1] = _to_mkl_long (x_strides[i] / x_itemsize);
    }

    status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_INPLACE);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_INPUT_STRIDES, input_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (0 != status) goto cleanup;

    x_data = (MKL_Complex8*) PyArray_DATA(x_inout);

    Py_BEGIN_ALLOW_THREADS
    status = DftiCommitDescriptor(hand);
    if (0 == status) {
        status = DftiComputeBackward(hand, x_data);
    }
    Py_END_ALLOW_THREADS
    if (0 != status) goto cleanup;

  cleanup:
    if(hand) DftiFreeDescriptor(&hand);
    if(input_strides) mkl_free(input_strides);
    if(x_shape_mkl != (MKL_LONG*) x_shape) mkl_free(x_shape_mkl);

    return status;

}

#line 1246
int
cdouble_cdouble_mkl_ifftnd_in(PyArrayObject *x_inout, double fsc)
{
    DFTI_DESCRIPTOR_HANDLE hand = 0;
    MKL_LONG status = 0;
    MKL_Complex16 *x_data = 0;
    MKL_LONG *input_strides = 0;
    npy_intp *x_shape = 0, *x_strides = 0;
    MKL_LONG *x_shape_mkl = 0;
    int i, x_rank;
    npy_intp x_size, x_itemsize;

    get_basic_array_data(x_inout, &x_rank, &x_shape,
                         &x_strides, &x_itemsize, &x_size);


    if (x_size == 0) /* nothing to do */
	return status;

    assert(x_size > 0);
    assert(x_itemsize == sizeof(MKL_Complex16));

    if (x_rank == 1) {
        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_DOUBLE,
                DFTI_COMPLEX,
                x_rank,
                _to_mkl_long(x_shape[0])
                );
    } else {
        if(NPY_LIKELY(sizeof(MKL_LONG) == sizeof(npy_intp))) {
            x_shape_mkl = (MKL_LONG*)x_shape;
        } else {
            // TODO: should rather use alloca or something optimized for small allocations
            x_shape_mkl = (MKL_LONG *) mkl_malloc(x_rank*sizeof(MKL_LONG), 64);
            if (!x_shape_mkl) goto cleanup;
            for(i = 0; i < x_rank; i++)
                x_shape_mkl[i] = _to_mkl_long(x_shape[i]);
        }

        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_DOUBLE,
                DFTI_COMPLEX,
                x_rank,
                x_shape_mkl
                );
    }
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_FORWARD_SCALE, fsc);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_BACKWARD_SCALE, 1.0 / (fsc * x_size));
    if (0 != status) goto cleanup;

    input_strides = (MKL_LONG *) mkl_malloc((x_rank + 1) * sizeof(MKL_LONG), 64);
    if (!input_strides) goto cleanup;

    input_strides[0] = 0;
    for(i = 0; i < x_rank; i++) {
        assert( x_strides[i] % x_itemsize == 0 );
        input_strides[i + 1] = _to_mkl_long (x_strides[i] / x_itemsize);
    }

    status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_INPLACE);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_INPUT_STRIDES, input_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (0 != status) goto cleanup;

    x_data = (MKL_Complex16*) PyArray_DATA(x_inout);

    Py_BEGIN_ALLOW_THREADS
    status = DftiCommitDescriptor(hand);
    if (0 == status) {
        status = DftiComputeBackward(hand, x_data);
    }
    Py_END_ALLOW_THREADS
    if (0 != status) goto cleanup;

  cleanup:
    if(hand) DftiFreeDescriptor(&hand);
    if(input_strides) mkl_free(input_strides);
    if(x_shape_mkl != (MKL_LONG*) x_shape) mkl_free(x_shape_mkl);

    return status;

}


#line 1352
int
cfloat_cfloat_mkl_fftnd_out(
    PyArrayObject *x_in, PyArrayObject *x_out, double fsc)
{
    DFTI_DESCRIPTOR_HANDLE hand = 0;
    MKL_LONG status = 0;
    MKL_Complex8 *xin_data = 0;
    MKL_Complex8 *xout_data = 0;
    MKL_LONG *input_strides = 0, *output_strides = 0;
    npy_intp *xin_shape = 0, *xin_strides = 0, *xout_shape = 0, *xout_strides = 0;
    MKL_LONG *xin_shape_mkl = 0;
    int i, xin_rank, xout_rank;
    npy_intp xin_size, xout_size, xin_itemsize, xout_itemsize;

    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    if(xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert(xin_size > 0);
    assert(xin_itemsize == sizeof(MKL_Complex8));
    assert(xin_size == xout_size);
    assert(xout_itemsize == sizeof(MKL_Complex8));
    assert(xin_rank == xout_rank);

    if (xin_rank == 1) {
        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_SINGLE,
                DFTI_COMPLEX,
                1,
                _to_mkl_long(xin_shape[0])
                );
    } else {
        if(NPY_LIKELY(sizeof(MKL_LONG) == sizeof(npy_intp))) {
            xin_shape_mkl = (MKL_LONG*) xin_shape;
        } else {
            // TODO: should rather use alloca or something optimized for small allocations
            xin_shape_mkl = (MKL_LONG *) mkl_malloc(xin_rank * sizeof(MKL_LONG), 64);
            if (!xin_shape_mkl) goto cleanup;
            for(i = 0; i < xin_rank; i++)
                xin_shape_mkl[i] = _to_mkl_long(xin_shape[i]);
        }

        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_SINGLE,
                DFTI_COMPLEX,
                xin_rank,
                xin_shape_mkl
                );
    }
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_FORWARD_SCALE, fsc);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_BACKWARD_SCALE, 1.0 / (fsc * xin_size));
    if (0 != status) goto cleanup;

    input_strides = (MKL_LONG *) mkl_malloc((xin_rank + 1) * sizeof(MKL_LONG), 64);
    if (!input_strides) goto cleanup;

    output_strides = (MKL_LONG *) mkl_malloc((xout_rank + 1) * sizeof(MKL_LONG), 64);
    if (!output_strides) goto cleanup;

    input_strides[0] = 0;
    output_strides[0] = 0;
    for(i = 0; i < xin_rank; i++) {
        assert( xin_strides[i] % xin_itemsize == 0 );
        assert( xout_strides[i] % xout_itemsize == 0 );

        input_strides[i + 1] = _to_mkl_long (xin_strides[i] / xin_itemsize);
        output_strides[i + 1] = _to_mkl_long (xout_strides[i] / xout_itemsize);
    }

    status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_INPUT_STRIDES, input_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_OUTPUT_STRIDES, output_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (0 != status) goto cleanup;

    xin_data = (MKL_Complex8*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex8*) PyArray_DATA(x_out);

    Py_BEGIN_ALLOW_THREADS
    status = DftiCommitDescriptor(hand);
    if (0 == status) {
        status = DftiComputeForward(hand, xin_data, xout_data);
    }
    Py_END_ALLOW_THREADS
    if (0 != status) goto cleanup;

  cleanup:
    if(hand) DftiFreeDescriptor(&hand);
    if(input_strides) mkl_free(input_strides);
    if(output_strides) mkl_free(output_strides);
    if(xin_shape_mkl != (MKL_LONG *)xin_shape) mkl_free(xin_shape_mkl);


    return status;

}

#line 1352
int
cdouble_cdouble_mkl_fftnd_out(
    PyArrayObject *x_in, PyArrayObject *x_out, double fsc)
{
    DFTI_DESCRIPTOR_HANDLE hand = 0;
    MKL_LONG status = 0;
    MKL_Complex16 *xin_data = 0;
    MKL_Complex16 *xout_data = 0;
    MKL_LONG *input_strides = 0, *output_strides = 0;
    npy_intp *xin_shape = 0, *xin_strides = 0, *xout_shape = 0, *xout_strides = 0;
    MKL_LONG *xin_shape_mkl = 0;
    int i, xin_rank, xout_rank;
    npy_intp xin_size, xout_size, xin_itemsize, xout_itemsize;

    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    if(xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert(xin_size > 0);
    assert(xin_itemsize == sizeof(MKL_Complex16));
    assert(xin_size == xout_size);
    assert(xout_itemsize == sizeof(MKL_Complex16));
    assert(xin_rank == xout_rank);

    if (xin_rank == 1) {
        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_DOUBLE,
                DFTI_COMPLEX,
                1,
                _to_mkl_long(xin_shape[0])
                );
    } else {
        if(NPY_LIKELY(sizeof(MKL_LONG) == sizeof(npy_intp))) {
            xin_shape_mkl = (MKL_LONG*) xin_shape;
        } else {
            // TODO: should rather use alloca or something optimized for small allocations
            xin_shape_mkl = (MKL_LONG *) mkl_malloc(xin_rank * sizeof(MKL_LONG), 64);
            if (!xin_shape_mkl) goto cleanup;
            for(i = 0; i < xin_rank; i++)
                xin_shape_mkl[i] = _to_mkl_long(xin_shape[i]);
        }

        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_DOUBLE,
                DFTI_COMPLEX,
                xin_rank,
                xin_shape_mkl
                );
    }
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_FORWARD_SCALE, fsc);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_BACKWARD_SCALE, 1.0 / (fsc * xin_size));
    if (0 != status) goto cleanup;

    input_strides = (MKL_LONG *) mkl_malloc((xin_rank + 1) * sizeof(MKL_LONG), 64);
    if (!input_strides) goto cleanup;

    output_strides = (MKL_LONG *) mkl_malloc((xout_rank + 1) * sizeof(MKL_LONG), 64);
    if (!output_strides) goto cleanup;

    input_strides[0] = 0;
    output_strides[0] = 0;
    for(i = 0; i < xin_rank; i++) {
        assert( xin_strides[i] % xin_itemsize == 0 );
        assert( xout_strides[i] % xout_itemsize == 0 );

        input_strides[i + 1] = _to_mkl_long (xin_strides[i] / xin_itemsize);
        output_strides[i + 1] = _to_mkl_long (xout_strides[i] / xout_itemsize);
    }

    status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_INPUT_STRIDES, input_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_OUTPUT_STRIDES, output_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (0 != status) goto cleanup;

    xin_data = (MKL_Complex16*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex16*) PyArray_DATA(x_out);

    Py_BEGIN_ALLOW_THREADS
    status = DftiCommitDescriptor(hand);
    if (0 == status) {
        status = DftiComputeForward(hand, xin_data, xout_data);
    }
    Py_END_ALLOW_THREADS
    if (0 != status) goto cleanup;

  cleanup:
    if(hand) DftiFreeDescriptor(&hand);
    if(input_strides) mkl_free(input_strides);
    if(output_strides) mkl_free(output_strides);
    if(xin_shape_mkl != (MKL_LONG *)xin_shape) mkl_free(xin_shape_mkl);


    return status;

}

#line 1352
int
cfloat_cfloat_mkl_ifftnd_out(
    PyArrayObject *x_in, PyArrayObject *x_out, double fsc)
{
    DFTI_DESCRIPTOR_HANDLE hand = 0;
    MKL_LONG status = 0;
    MKL_Complex8 *xin_data = 0;
    MKL_Complex8 *xout_data = 0;
    MKL_LONG *input_strides = 0, *output_strides = 0;
    npy_intp *xin_shape = 0, *xin_strides = 0, *xout_shape = 0, *xout_strides = 0;
    MKL_LONG *xin_shape_mkl = 0;
    int i, xin_rank, xout_rank;
    npy_intp xin_size, xout_size, xin_itemsize, xout_itemsize;

    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    if(xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert(xin_size > 0);
    assert(xin_itemsize == sizeof(MKL_Complex8));
    assert(xin_size == xout_size);
    assert(xout_itemsize == sizeof(MKL_Complex8));
    assert(xin_rank == xout_rank);

    if (xin_rank == 1) {
        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_SINGLE,
                DFTI_COMPLEX,
                1,
                _to_mkl_long(xin_shape[0])
                );
    } else {
        if(NPY_LIKELY(sizeof(MKL_LONG) == sizeof(npy_intp))) {
            xin_shape_mkl = (MKL_LONG*) xin_shape;
        } else {
            // TODO: should rather use alloca or something optimized for small allocations
            xin_shape_mkl = (MKL_LONG *) mkl_malloc(xin_rank * sizeof(MKL_LONG), 64);
            if (!xin_shape_mkl) goto cleanup;
            for(i = 0; i < xin_rank; i++)
                xin_shape_mkl[i] = _to_mkl_long(xin_shape[i]);
        }

        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_SINGLE,
                DFTI_COMPLEX,
                xin_rank,
                xin_shape_mkl
                );
    }
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_FORWARD_SCALE, fsc);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_BACKWARD_SCALE, 1.0 / (fsc * xin_size));
    if (0 != status) goto cleanup;

    input_strides = (MKL_LONG *) mkl_malloc((xin_rank + 1) * sizeof(MKL_LONG), 64);
    if (!input_strides) goto cleanup;

    output_strides = (MKL_LONG *) mkl_malloc((xout_rank + 1) * sizeof(MKL_LONG), 64);
    if (!output_strides) goto cleanup;

    input_strides[0] = 0;
    output_strides[0] = 0;
    for(i = 0; i < xin_rank; i++) {
        assert( xin_strides[i] % xin_itemsize == 0 );
        assert( xout_strides[i] % xout_itemsize == 0 );

        input_strides[i + 1] = _to_mkl_long (xin_strides[i] / xin_itemsize);
        output_strides[i + 1] = _to_mkl_long (xout_strides[i] / xout_itemsize);
    }

    status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_INPUT_STRIDES, input_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_OUTPUT_STRIDES, output_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (0 != status) goto cleanup;

    xin_data = (MKL_Complex8*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex8*) PyArray_DATA(x_out);

    Py_BEGIN_ALLOW_THREADS
    status = DftiCommitDescriptor(hand);
    if (0 == status) {
        status = DftiComputeBackward(hand, xin_data, xout_data);
    }
    Py_END_ALLOW_THREADS
    if (0 != status) goto cleanup;

  cleanup:
    if(hand) DftiFreeDescriptor(&hand);
    if(input_strides) mkl_free(input_strides);
    if(output_strides) mkl_free(output_strides);
    if(xin_shape_mkl != (MKL_LONG *)xin_shape) mkl_free(xin_shape_mkl);


    return status;

}

#line 1352
int
cdouble_cdouble_mkl_ifftnd_out(
    PyArrayObject *x_in, PyArrayObject *x_out, double fsc)
{
    DFTI_DESCRIPTOR_HANDLE hand = 0;
    MKL_LONG status = 0;
    MKL_Complex16 *xin_data = 0;
    MKL_Complex16 *xout_data = 0;
    MKL_LONG *input_strides = 0, *output_strides = 0;
    npy_intp *xin_shape = 0, *xin_strides = 0, *xout_shape = 0, *xout_strides = 0;
    MKL_LONG *xin_shape_mkl = 0;
    int i, xin_rank, xout_rank;
    npy_intp xin_size, xout_size, xin_itemsize, xout_itemsize;

    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    if(xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert(xin_size > 0);
    assert(xin_itemsize == sizeof(MKL_Complex16));
    assert(xin_size == xout_size);
    assert(xout_itemsize == sizeof(MKL_Complex16));
    assert(xin_rank == xout_rank);

    if (xin_rank == 1) {
        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_DOUBLE,
                DFTI_COMPLEX,
                1,
                _to_mkl_long(xin_shape[0])
                );
    } else {
        if(NPY_LIKELY(sizeof(MKL_LONG) == sizeof(npy_intp))) {
            xin_shape_mkl = (MKL_LONG*) xin_shape;
        } else {
            // TODO: should rather use alloca or something optimized for small allocations
            xin_shape_mkl = (MKL_LONG *) mkl_malloc(xin_rank * sizeof(MKL_LONG), 64);
            if (!xin_shape_mkl) goto cleanup;
            for(i = 0; i < xin_rank; i++)
                xin_shape_mkl[i] = _to_mkl_long(xin_shape[i]);
        }

        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_DOUBLE,
                DFTI_COMPLEX,
                xin_rank,
                xin_shape_mkl
                );
    }
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_FORWARD_SCALE, fsc);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_BACKWARD_SCALE, 1.0 / (fsc * xin_size));
    if (0 != status) goto cleanup;

    input_strides = (MKL_LONG *) mkl_malloc((xin_rank + 1) * sizeof(MKL_LONG), 64);
    if (!input_strides) goto cleanup;

    output_strides = (MKL_LONG *) mkl_malloc((xout_rank + 1) * sizeof(MKL_LONG), 64);
    if (!output_strides) goto cleanup;

    input_strides[0] = 0;
    output_strides[0] = 0;
    for(i = 0; i < xin_rank; i++) {
        assert( xin_strides[i] % xin_itemsize == 0 );
        assert( xout_strides[i] % xout_itemsize == 0 );

        input_strides[i + 1] = _to_mkl_long (xin_strides[i] / xin_itemsize);
        output_strides[i + 1] = _to_mkl_long (xout_strides[i] / xout_itemsize);
    }

    status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_INPUT_STRIDES, input_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_OUTPUT_STRIDES, output_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (0 != status) goto cleanup;

    xin_data = (MKL_Complex16*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex16*) PyArray_DATA(x_out);

    Py_BEGIN_ALLOW_THREADS
    status = DftiCommitDescriptor(hand);
    if (0 == status) {
        status = DftiComputeBackward(hand, xin_data, xout_data);
    }
    Py_END_ALLOW_THREADS
    if (0 != status) goto cleanup;

  cleanup:
    if(hand) DftiFreeDescriptor(&hand);
    if(input_strides) mkl_free(input_strides);
    if(output_strides) mkl_free(output_strides);
    if(xin_shape_mkl != (MKL_LONG *)xin_shape) mkl_free(xin_shape_mkl);


    return status;

}


/* -======================================- */
/*  multivariate forward FFT on real input  */
/* -======================================- */

#line 1486
int
float_cfloat_mkl_fftnd_out(
    PyArrayObject *x_in, PyArrayObject *x_out, double fsc)
{
    DFTI_DESCRIPTOR_HANDLE hand = 0;
    MKL_LONG status = 0;
    float *xin_data = 0;
    MKL_Complex8 *xout_data = 0;
    MKL_LONG *input_strides = 0, *output_strides = 0;
    npy_intp *xin_shape = 0, *xin_strides = 0, *xout_shape = 0, *xout_strides = 0;
    MKL_LONG *xin_shape_mkl = 0;
    int i, xin_rank, xout_rank;
    npy_intp xin_size, xout_size, xin_itemsize, xout_itemsize;

    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    if(xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert(xin_size > 0);
    assert(xin_itemsize == sizeof(float));
    assert(xin_size == xout_size);
    assert(xout_itemsize == sizeof(MKL_Complex8));
    assert(xin_rank == xout_rank);

    if (xin_rank == 1) {
        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_SINGLE,
                DFTI_REAL,
                1,
                _to_mkl_long(xin_shape[0])
                );
    } else {
        if(NPY_LIKELY(sizeof(MKL_LONG) == sizeof(npy_intp))) {
            xin_shape_mkl = (MKL_LONG*) xin_shape;
        } else {
            /* TODO: should rather use alloca or something optimized for small allocations */
            xin_shape_mkl = (MKL_LONG *) mkl_malloc(xin_rank * sizeof(MKL_LONG), 64);
            if (!xin_shape_mkl) goto cleanup;
            for(i = 0; i < xin_rank; i++)
                xin_shape_mkl[i] = _to_mkl_long(xin_shape[i]);
        }

        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_SINGLE,
                DFTI_REAL,
                xin_rank,
                xin_shape_mkl
                );
    }
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_FORWARD_SCALE, (FALSE) ? 1.0/(fsc * xin_size) :  fsc);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_BACKWARD_SCALE, (FALSE) ? fsc : 1.0 / (fsc*xin_size));
    if (0 != status) goto cleanup;

    input_strides = (MKL_LONG *) mkl_malloc((xin_rank + 1) * sizeof(MKL_LONG), 64);
    if (!input_strides) goto cleanup;

    output_strides = (MKL_LONG *) mkl_malloc((xout_rank + 1) * sizeof(MKL_LONG), 64);
    if (!output_strides) goto cleanup;

    input_strides[0] = 0;
    output_strides[0] = 0;
    for(i = 0; i < xin_rank; i++) {
        assert( xin_strides[i] % xin_itemsize == 0 );
        assert( xout_strides[i] % xout_itemsize == 0 );

        input_strides[i + 1] = _to_mkl_long (xin_strides[i] / xin_itemsize);
        output_strides[i + 1] = _to_mkl_long (xout_strides[i] / xout_itemsize);
    }

    status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_INPUT_STRIDES, input_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_OUTPUT_STRIDES, output_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (0 != status) goto cleanup;

    xin_data = (float*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex8*) PyArray_DATA(x_out);

    Py_BEGIN_ALLOW_THREADS
    status = DftiCommitDescriptor(hand);
    if (0 == status) {
        status = DftiComputeForward(hand, xin_data, xout_data);
    }
    Py_END_ALLOW_THREADS
    if (0 != status) goto cleanup;

    /* copy conjugate even harmonics */
    {
        multi_iter_t mit;
        npy_intp *half_shape;
        npy_intp n_last, nh_last;
        int i, last_idx = xout_rank - 1;

        half_shape = (npy_intp *) mkl_malloc(xout_rank * sizeof(npy_intp), 64);

        memcpy(half_shape, xout_shape, xout_rank * sizeof(npy_intp));
        n_last = xout_shape[last_idx];
        nh_last = (n_last/2) + 1;
        half_shape[last_idx] = (n_last > 2) ? n_last - nh_last: 0;

        multi_iter_new(&mit, half_shape, xout_rank);
	mkl_free(half_shape);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;
            MKL_Complex8 *dest, *src;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xout_data, tmp2 = (char *) xout_data,
                i = 0; i < xout_rank; i++) {
                npy_intp si = xout_strides[i],
                         ni = xout_shape[i],
                         src_ki = MultiIter_IndexElem(mit, i),
                         dest_ki;

                dest_ki = src_ki;
                if (i == last_idx) {
                    dest_ki += nh_last;
                }
                assert(dest_ki >= 0);
                assert(dest_ki < ni);
                src_ki = (dest_ki) ? ni - dest_ki : dest_ki;

                tmp1 += si * dest_ki;
                tmp2 += si * src_ki;
            }

            dest = (MKL_Complex8*) tmp1;
            src  = (MKL_Complex8*) tmp2;
            SET_CONJ(dest, src);

            if (multi_iter_next(&mit))
                break;
        }

        multi_iter_free(&mit);
   }

 
    if (FALSE) {
       Py_BEGIN_ALLOW_THREADS
       vmcConj(xout_size, xout_data, xout_data, VML_HA);
       Py_END_ALLOW_THREADS
    }

  cleanup:
    if(hand) DftiFreeDescriptor(&hand);
    if(input_strides) mkl_free(input_strides);
    if(output_strides) mkl_free(output_strides);
    if(xin_shape_mkl != (MKL_LONG *)xin_shape) mkl_free(xin_shape_mkl);


    return status;

}

#line 1486
int
double_cdouble_mkl_fftnd_out(
    PyArrayObject *x_in, PyArrayObject *x_out, double fsc)
{
    DFTI_DESCRIPTOR_HANDLE hand = 0;
    MKL_LONG status = 0;
    double *xin_data = 0;
    MKL_Complex16 *xout_data = 0;
    MKL_LONG *input_strides = 0, *output_strides = 0;
    npy_intp *xin_shape = 0, *xin_strides = 0, *xout_shape = 0, *xout_strides = 0;
    MKL_LONG *xin_shape_mkl = 0;
    int i, xin_rank, xout_rank;
    npy_intp xin_size, xout_size, xin_itemsize, xout_itemsize;

    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    if(xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert(xin_size > 0);
    assert(xin_itemsize == sizeof(double));
    assert(xin_size == xout_size);
    assert(xout_itemsize == sizeof(MKL_Complex16));
    assert(xin_rank == xout_rank);

    if (xin_rank == 1) {
        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_DOUBLE,
                DFTI_REAL,
                1,
                _to_mkl_long(xin_shape[0])
                );
    } else {
        if(NPY_LIKELY(sizeof(MKL_LONG) == sizeof(npy_intp))) {
            xin_shape_mkl = (MKL_LONG*) xin_shape;
        } else {
            /* TODO: should rather use alloca or something optimized for small allocations */
            xin_shape_mkl = (MKL_LONG *) mkl_malloc(xin_rank * sizeof(MKL_LONG), 64);
            if (!xin_shape_mkl) goto cleanup;
            for(i = 0; i < xin_rank; i++)
                xin_shape_mkl[i] = _to_mkl_long(xin_shape[i]);
        }

        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_DOUBLE,
                DFTI_REAL,
                xin_rank,
                xin_shape_mkl
                );
    }
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_FORWARD_SCALE, (FALSE) ? 1.0/(fsc * xin_size) :  fsc);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_BACKWARD_SCALE, (FALSE) ? fsc : 1.0 / (fsc*xin_size));
    if (0 != status) goto cleanup;

    input_strides = (MKL_LONG *) mkl_malloc((xin_rank + 1) * sizeof(MKL_LONG), 64);
    if (!input_strides) goto cleanup;

    output_strides = (MKL_LONG *) mkl_malloc((xout_rank + 1) * sizeof(MKL_LONG), 64);
    if (!output_strides) goto cleanup;

    input_strides[0] = 0;
    output_strides[0] = 0;
    for(i = 0; i < xin_rank; i++) {
        assert( xin_strides[i] % xin_itemsize == 0 );
        assert( xout_strides[i] % xout_itemsize == 0 );

        input_strides[i + 1] = _to_mkl_long (xin_strides[i] / xin_itemsize);
        output_strides[i + 1] = _to_mkl_long (xout_strides[i] / xout_itemsize);
    }

    status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_INPUT_STRIDES, input_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_OUTPUT_STRIDES, output_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (0 != status) goto cleanup;

    xin_data = (double*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex16*) PyArray_DATA(x_out);

    Py_BEGIN_ALLOW_THREADS
    status = DftiCommitDescriptor(hand);
    if (0 == status) {
        status = DftiComputeForward(hand, xin_data, xout_data);
    }
    Py_END_ALLOW_THREADS
    if (0 != status) goto cleanup;

    /* copy conjugate even harmonics */
    {
        multi_iter_t mit;
        npy_intp *half_shape;
        npy_intp n_last, nh_last;
        int i, last_idx = xout_rank - 1;

        half_shape = (npy_intp *) mkl_malloc(xout_rank * sizeof(npy_intp), 64);

        memcpy(half_shape, xout_shape, xout_rank * sizeof(npy_intp));
        n_last = xout_shape[last_idx];
        nh_last = (n_last/2) + 1;
        half_shape[last_idx] = (n_last > 2) ? n_last - nh_last: 0;

        multi_iter_new(&mit, half_shape, xout_rank);
	mkl_free(half_shape);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;
            MKL_Complex16 *dest, *src;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xout_data, tmp2 = (char *) xout_data,
                i = 0; i < xout_rank; i++) {
                npy_intp si = xout_strides[i],
                         ni = xout_shape[i],
                         src_ki = MultiIter_IndexElem(mit, i),
                         dest_ki;

                dest_ki = src_ki;
                if (i == last_idx) {
                    dest_ki += nh_last;
                }
                assert(dest_ki >= 0);
                assert(dest_ki < ni);
                src_ki = (dest_ki) ? ni - dest_ki : dest_ki;

                tmp1 += si * dest_ki;
                tmp2 += si * src_ki;
            }

            dest = (MKL_Complex16*) tmp1;
            src  = (MKL_Complex16*) tmp2;
            SET_CONJ(dest, src);

            if (multi_iter_next(&mit))
                break;
        }

        multi_iter_free(&mit);
   }

 
    if (FALSE) {
       Py_BEGIN_ALLOW_THREADS
       vmzConj(xout_size, xout_data, xout_data, VML_HA);
       Py_END_ALLOW_THREADS
    }

  cleanup:
    if(hand) DftiFreeDescriptor(&hand);
    if(input_strides) mkl_free(input_strides);
    if(output_strides) mkl_free(output_strides);
    if(xin_shape_mkl != (MKL_LONG *)xin_shape) mkl_free(xin_shape_mkl);


    return status;

}

#line 1486
int
float_cfloat_mkl_ifftnd_out(
    PyArrayObject *x_in, PyArrayObject *x_out, double fsc)
{
    DFTI_DESCRIPTOR_HANDLE hand = 0;
    MKL_LONG status = 0;
    float *xin_data = 0;
    MKL_Complex8 *xout_data = 0;
    MKL_LONG *input_strides = 0, *output_strides = 0;
    npy_intp *xin_shape = 0, *xin_strides = 0, *xout_shape = 0, *xout_strides = 0;
    MKL_LONG *xin_shape_mkl = 0;
    int i, xin_rank, xout_rank;
    npy_intp xin_size, xout_size, xin_itemsize, xout_itemsize;

    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    if(xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert(xin_size > 0);
    assert(xin_itemsize == sizeof(float));
    assert(xin_size == xout_size);
    assert(xout_itemsize == sizeof(MKL_Complex8));
    assert(xin_rank == xout_rank);

    if (xin_rank == 1) {
        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_SINGLE,
                DFTI_REAL,
                1,
                _to_mkl_long(xin_shape[0])
                );
    } else {
        if(NPY_LIKELY(sizeof(MKL_LONG) == sizeof(npy_intp))) {
            xin_shape_mkl = (MKL_LONG*) xin_shape;
        } else {
            /* TODO: should rather use alloca or something optimized for small allocations */
            xin_shape_mkl = (MKL_LONG *) mkl_malloc(xin_rank * sizeof(MKL_LONG), 64);
            if (!xin_shape_mkl) goto cleanup;
            for(i = 0; i < xin_rank; i++)
                xin_shape_mkl[i] = _to_mkl_long(xin_shape[i]);
        }

        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_SINGLE,
                DFTI_REAL,
                xin_rank,
                xin_shape_mkl
                );
    }
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_FORWARD_SCALE, (TRUE) ? 1.0/(fsc * xin_size) :  fsc);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_BACKWARD_SCALE, (TRUE) ? fsc : 1.0 / (fsc*xin_size));
    if (0 != status) goto cleanup;

    input_strides = (MKL_LONG *) mkl_malloc((xin_rank + 1) * sizeof(MKL_LONG), 64);
    if (!input_strides) goto cleanup;

    output_strides = (MKL_LONG *) mkl_malloc((xout_rank + 1) * sizeof(MKL_LONG), 64);
    if (!output_strides) goto cleanup;

    input_strides[0] = 0;
    output_strides[0] = 0;
    for(i = 0; i < xin_rank; i++) {
        assert( xin_strides[i] % xin_itemsize == 0 );
        assert( xout_strides[i] % xout_itemsize == 0 );

        input_strides[i + 1] = _to_mkl_long (xin_strides[i] / xin_itemsize);
        output_strides[i + 1] = _to_mkl_long (xout_strides[i] / xout_itemsize);
    }

    status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_INPUT_STRIDES, input_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_OUTPUT_STRIDES, output_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (0 != status) goto cleanup;

    xin_data = (float*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex8*) PyArray_DATA(x_out);

    Py_BEGIN_ALLOW_THREADS
    status = DftiCommitDescriptor(hand);
    if (0 == status) {
        status = DftiComputeForward(hand, xin_data, xout_data);
    }
    Py_END_ALLOW_THREADS
    if (0 != status) goto cleanup;

    /* copy conjugate even harmonics */
    {
        multi_iter_t mit;
        npy_intp *half_shape;
        npy_intp n_last, nh_last;
        int i, last_idx = xout_rank - 1;

        half_shape = (npy_intp *) mkl_malloc(xout_rank * sizeof(npy_intp), 64);

        memcpy(half_shape, xout_shape, xout_rank * sizeof(npy_intp));
        n_last = xout_shape[last_idx];
        nh_last = (n_last/2) + 1;
        half_shape[last_idx] = (n_last > 2) ? n_last - nh_last: 0;

        multi_iter_new(&mit, half_shape, xout_rank);
	mkl_free(half_shape);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;
            MKL_Complex8 *dest, *src;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xout_data, tmp2 = (char *) xout_data,
                i = 0; i < xout_rank; i++) {
                npy_intp si = xout_strides[i],
                         ni = xout_shape[i],
                         src_ki = MultiIter_IndexElem(mit, i),
                         dest_ki;

                dest_ki = src_ki;
                if (i == last_idx) {
                    dest_ki += nh_last;
                }
                assert(dest_ki >= 0);
                assert(dest_ki < ni);
                src_ki = (dest_ki) ? ni - dest_ki : dest_ki;

                tmp1 += si * dest_ki;
                tmp2 += si * src_ki;
            }

            dest = (MKL_Complex8*) tmp1;
            src  = (MKL_Complex8*) tmp2;
            SET_CONJ(dest, src);

            if (multi_iter_next(&mit))
                break;
        }

        multi_iter_free(&mit);
   }

 
    if (TRUE) {
       Py_BEGIN_ALLOW_THREADS
       vmcConj(xout_size, xout_data, xout_data, VML_HA);
       Py_END_ALLOW_THREADS
    }

  cleanup:
    if(hand) DftiFreeDescriptor(&hand);
    if(input_strides) mkl_free(input_strides);
    if(output_strides) mkl_free(output_strides);
    if(xin_shape_mkl != (MKL_LONG *)xin_shape) mkl_free(xin_shape_mkl);


    return status;

}

#line 1486
int
double_cdouble_mkl_ifftnd_out(
    PyArrayObject *x_in, PyArrayObject *x_out, double fsc)
{
    DFTI_DESCRIPTOR_HANDLE hand = 0;
    MKL_LONG status = 0;
    double *xin_data = 0;
    MKL_Complex16 *xout_data = 0;
    MKL_LONG *input_strides = 0, *output_strides = 0;
    npy_intp *xin_shape = 0, *xin_strides = 0, *xout_shape = 0, *xout_strides = 0;
    MKL_LONG *xin_shape_mkl = 0;
    int i, xin_rank, xout_rank;
    npy_intp xin_size, xout_size, xin_itemsize, xout_itemsize;

    get_basic_array_data(x_in, &xin_rank, &xin_shape,
                         &xin_strides, &xin_itemsize, &xin_size);
    get_basic_array_data(x_out, &xout_rank, &xout_shape,
                         &xout_strides, &xout_itemsize, &xout_size);

    if(xin_size == 0) {
	/* nothing to do */
	assert(xout_size == 0);
	return status;
    }

    assert(xin_size > 0);
    assert(xin_itemsize == sizeof(double));
    assert(xin_size == xout_size);
    assert(xout_itemsize == sizeof(MKL_Complex16));
    assert(xin_rank == xout_rank);

    if (xin_rank == 1) {
        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_DOUBLE,
                DFTI_REAL,
                1,
                _to_mkl_long(xin_shape[0])
                );
    } else {
        if(NPY_LIKELY(sizeof(MKL_LONG) == sizeof(npy_intp))) {
            xin_shape_mkl = (MKL_LONG*) xin_shape;
        } else {
            /* TODO: should rather use alloca or something optimized for small allocations */
            xin_shape_mkl = (MKL_LONG *) mkl_malloc(xin_rank * sizeof(MKL_LONG), 64);
            if (!xin_shape_mkl) goto cleanup;
            for(i = 0; i < xin_rank; i++)
                xin_shape_mkl[i] = _to_mkl_long(xin_shape[i]);
        }

        status =
            DftiCreateDescriptor(
                &hand,
                DFTI_DOUBLE,
                DFTI_REAL,
                xin_rank,
                xin_shape_mkl
                );
    }
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_FORWARD_SCALE, (TRUE) ? 1.0/(fsc * xin_size) :  fsc);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_BACKWARD_SCALE, (TRUE) ? fsc : 1.0 / (fsc*xin_size));
    if (0 != status) goto cleanup;

    input_strides = (MKL_LONG *) mkl_malloc((xin_rank + 1) * sizeof(MKL_LONG), 64);
    if (!input_strides) goto cleanup;

    output_strides = (MKL_LONG *) mkl_malloc((xout_rank + 1) * sizeof(MKL_LONG), 64);
    if (!output_strides) goto cleanup;

    input_strides[0] = 0;
    output_strides[0] = 0;
    for(i = 0; i < xin_rank; i++) {
        assert( xin_strides[i] % xin_itemsize == 0 );
        assert( xout_strides[i] % xout_itemsize == 0 );

        input_strides[i + 1] = _to_mkl_long (xin_strides[i] / xin_itemsize);
        output_strides[i + 1] = _to_mkl_long (xout_strides[i] / xout_itemsize);
    }

    status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_INPUT_STRIDES, input_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_OUTPUT_STRIDES, output_strides);
    if (0 != status) goto cleanup;

    status = DftiSetValue(hand, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
    if (0 != status) goto cleanup;

    xin_data = (double*) PyArray_DATA(x_in);
    xout_data = (MKL_Complex16*) PyArray_DATA(x_out);

    Py_BEGIN_ALLOW_THREADS
    status = DftiCommitDescriptor(hand);
    if (0 == status) {
        status = DftiComputeForward(hand, xin_data, xout_data);
    }
    Py_END_ALLOW_THREADS
    if (0 != status) goto cleanup;

    /* copy conjugate even harmonics */
    {
        multi_iter_t mit;
        npy_intp *half_shape;
        npy_intp n_last, nh_last;
        int i, last_idx = xout_rank - 1;

        half_shape = (npy_intp *) mkl_malloc(xout_rank * sizeof(npy_intp), 64);

        memcpy(half_shape, xout_shape, xout_rank * sizeof(npy_intp));
        n_last = xout_shape[last_idx];
        nh_last = (n_last/2) + 1;
        half_shape[last_idx] = (n_last > 2) ? n_last - nh_last: 0;

        multi_iter_new(&mit, half_shape, xout_rank);
	mkl_free(half_shape);

        while(!MultiIter_Done(mit)) {
            char *tmp1, *tmp2;
            MKL_Complex16 *dest, *src;

#if defined(__ICC) || defined(__INTEL_COMPILER)
            #pragma novector
#endif
            for(tmp1 = (char *) xout_data, tmp2 = (char *) xout_data,
                i = 0; i < xout_rank; i++) {
                npy_intp si = xout_strides[i],
                         ni = xout_shape[i],
                         src_ki = MultiIter_IndexElem(mit, i),
                         dest_ki;

                dest_ki = src_ki;
                if (i == last_idx) {
                    dest_ki += nh_last;
                }
                assert(dest_ki >= 0);
                assert(dest_ki < ni);
                src_ki = (dest_ki) ? ni - dest_ki : dest_ki;

                tmp1 += si * dest_ki;
                tmp2 += si * src_ki;
            }

            dest = (MKL_Complex16*) tmp1;
            src  = (MKL_Complex16*) tmp2;
            SET_CONJ(dest, src);

            if (multi_iter_next(&mit))
                break;
        }

        multi_iter_free(&mit);
   }

 
    if (TRUE) {
       Py_BEGIN_ALLOW_THREADS
       vmzConj(xout_size, xout_data, xout_data, VML_HA);
       Py_END_ALLOW_THREADS
    }

  cleanup:
    if(hand) DftiFreeDescriptor(&hand);
    if(input_strides) mkl_free(input_strides);
    if(output_strides) mkl_free(output_strides);
    if(xin_shape_mkl != (MKL_LONG *)xin_shape) mkl_free(xin_shape_mkl);


    return status;

}


