
    ܙd                     Z   d dl mZ d dlmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d	 Zed
             Zed             Zed             Z e ej        e          dd          d             Zed             Zd Zed             Zed             Zed             ZdS )    )ir)cudatypes)cgutils)RequireLiteralValue)	signature)overload_attribute)	nvvmutils)	intrinsicc                     | j         }|dk    rt          j        }n3|dv r t          j        t          j        |          }nt	          d          t          |t          j                  S )N   )      zargument can only be 1, 2, 3)literal_valuer   int32UniTuple
ValueErrorr   )ndimvalrestypes      5lib/python3.11/site-packages/numba/cuda/intrinsics.py_type_grid_functionr      s[    

C
axx+	.c227888Wek***    c                     t          |t          j                  st          |          t	          |          }d }||fS )a  grid(ndim)

    Return the absolute position of the current thread in the entire grid of
    blocks.  *ndim* should correspond to the number of dimensions declared when
    instantiating the kernel. If *ndim* is 1, a single integer is returned.
    If *ndim* is 2 or 3, a tuple of the given number of integers is returned.

    Computation of the first integer is as follows::

        cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x

    and is similar for the other two indices, but using the ``y`` and ``z``
    attributes.
    c                     |j         }|t          j        k    rt          j        |d          S t          |t          j                  r0t          j        ||j                  }t          j	        ||          S d S )Nr   )dim)
return_typer   r   r
   get_global_id
isinstancer   countr   
pack_array)contextbuildersigargsr   idss         r   codegenzgrid.<locals>.codegen1   st    /ek!!*7::::00 	4)'w}EEEC%gs333	4 	4r   r   r   IntegerLiteralr   r   )	typingctxr   r$   r'   s       r   gridr+      sO    " dE011 (!$'''
d
#
#C4 4 4 <r   c                     t          |t          j                  st          |          t	          |          }d fd}||fS )a  gridsize(ndim)

    Return the absolute size (or shape) in threads of the entire grid of
    blocks. *ndim* should correspond to the number of dimensions declared when
    instantiating the kernel. If *ndim* is 1, a single integer is returned.
    If *ndim* is 2 or 3, a tuple of the given number of integers is returned.

    Computation of the first integer is as follows::

        cuda.blockDim.x * cuda.gridDim.x

    and is similar for the other two indices, but using the ``y`` and ``z``
    attributes.
    c                     t          j        | d|           }t          j        | d|           }|                     ||          S )Nzntid.znctaid.)r
   	call_sregmul)r#   r   ntidnctaids       r   _nthreads_for_dimz#gridsize.<locals>._nthreads_for_dimR   sF    "7MCMM::$Wooo>>{{4(((r   c                 D   |j         } |d          }|t          j        k    r|S t          |t          j                  r] |d          }|j        dk    rt          j        |||f          S |j        dk    r& |d          }t          j        ||||f          S d S d S )Nxyr   r   z)r   r   r   r   r   r    r   r!   )	r"   r#   r$   r%   r   nxnynzr2   s	           r   r'   zgridsize.<locals>.codegenW   s    /w,,ek!!I00 	A""7C00B}!!)'B8<<<!##&&w44)'BB<@@@	A 	A
 $#r   r(   )r*   r   r$   r'   r2   s       @r   gridsizer:   <   sn    " dE011 (!$'''
d
#
#C) ) )
A A A A A <r   c                 B    t          t          j                  }d }||fS )Nc                 ,    t          j        |d          S )Nwarpsize)r
   r.   )r"   r#   r$   r%   s       r   r'   z_warpsize.<locals>.codegenm   s    "7J777r   )r   r   r   r*   r$   r'   s      r   	_warpsizer?   i   s,    
EK
 
 C8 8 8 <r   r=   r   )targetc                     d }|S )z_
    The size of a warp. All architectures implemented to date have a warp size
    of 32.
    c                     t                      S )N)r?   )mods    r   getzcuda_warpsize.<locals>.gety   s    {{r    )rC   rD   s     r   cuda_warpsizerF   s   s      Jr   c                 B    t          t          j                  }d }||fS )a  
    Synchronize all threads in the same thread block.  This function implements
    the same pattern as barriers in traditional multi-threaded programming: this
    function waits until all threads in the block call it, at which point it
    returns control to all its callers.
    c                     d}|j         }t          j        t          j                    d          }t	          j        |||          }|                    |d           |                                 S )Nzllvm.nvvm.barrier0rE   )moduler   FunctionTypeVoidTyper   get_or_insert_functioncallget_dummy_value)r"   r#   r$   r%   fnamelmodfntysyncs           r   r'   zsyncthreads.<locals>.codegen   s_    $~r{}}b11-dD%@@T2&&(((r   )r   r   noner>   s      r   syncthreadsrT      s.     EJ

C) ) ) <r   c                     t          |t          j                  sd S t          t          j        t          j                  }fd}||fS )Nc                     t          j        t          j        d          t          j        d          f          }t          j        |j        |          }|                    ||          S )N    )r   rJ   IntTyper   rL   rI   rM   )r"   r#   r$   r%   rQ   rR   rO   s         r   r'   z'_syncthreads_predicate.<locals>.codegen   sQ    rz"~~
2/@AA-gndEJJ||D$'''r   )r   r   Integerr   i4)r*   	predicaterO   r$   r'   s     `  r   _syncthreads_predicater\      sT    i// t
EHeh
'
'C( ( ( ( (
 <r   c                 (    d}t          | ||          S )z
    syncthreads_count(predicate)

    An extension to numba.cuda.syncthreads where the return value is a count
    of the threads where predicate is true.
    zllvm.nvvm.barrier0.popcr\   r*   r[   rO   s      r   syncthreads_countr`      s     &E!)Y>>>r   c                 (    d}t          | ||          S )z
    syncthreads_and(predicate)

    An extension to numba.cuda.syncthreads where 1 is returned if predicate is
    true for all threads or 0 otherwise.
    zllvm.nvvm.barrier0.andr^   r_   s      r   syncthreads_andrb      s     %E!)Y>>>r   c                 (    d}t          | ||          S )z
    syncthreads_or(predicate)

    An extension to numba.cuda.syncthreads where 1 is returned if predicate is
    true for any thread or 0 otherwise.
    zllvm.nvvm.barrier0.orr^   r_   s      r   syncthreads_orrd      s     $E!)Y>>>r   N)llvmliter   numbar   r   
numba.corer   numba.core.errorsr   numba.core.typingr   numba.core.extendingr	   
numba.cudar
   numba.cuda.extendingr   r   r+   r:   r?   ModulerF   rT   r\   r`   rb   rd   rE   r   r   <module>rn      s                       1 1 1 1 1 1 ' ' ' ' ' ' 3 3 3 3 3 3             * * * * * *	+ 	+ 	+   @ ) ) )X    LEL&&
6BBB  CB   (   ? ? ? ? ? ? ? ? ? ? ?r   