
    ܙdQ                        d dl Zd dlZd dlZd dlZd dlZd dlmZmZm	Z	m
Z
mZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m
Z* d dl+m,Z, d dl+m-Z- d dl.m/Z/ g dZ0 G d dej1                  Z2 G d de3          Z4 G d d          Z5 G d de          Z6 G d de          Z7 G d deej1                  Z8dS )     N)config	serializesigutilstypestypingutils)Cache	CacheImpl)global_compiler_lock)
Dispatcher)NumbaPerformanceWarning)Purposetypeof)get_current_device)wrap_arg)compile_cudaCUDACompiler)driver)get_context)cuda_target)missing_launch_config_msgnormalize_kernel_dimensions)r   cuda)_dispatcher)warn)hsinhcoshloghlog10hlog2hexphexp10hexp2hsqrthrsqrthfloorhceilhrcphrinthtrunchdivc                   \    e Zd ZdZe	 	 	 d fd	            Zed             Zed             Zd Z	ed	             Z
ed
             Ze fd            Zd Zd Zed             Zed             Zed             Zed             Zed             Zd Zd Zd ZddZddZddZd Z xZS )_Kernelz
    CUDA Kernel specialized for a given set of argument types. When called, this
    object launches the kernel on the device.
    NFTc                    |rt          d          t                                                       d| _        d | _        || _        || _        || _        || _        |pg | _	        ||
rdndd}t                      j        }t          | j        t          j        | j        | j        |||||	  	        }|j        }| j        j        }|j        }|j        }|                    |j        |j        ||||||	          \  }|sg }d                                v | _        | j        rd_        fd	t2          D             }|rt4          j        sd
}d| }t9          |          t:          j                            t:          j                             tB                              }t:          j        "                    |d          }|#                    |           |D ]}$                    |           |j%        | _&        |j'        | _'        |j(        | _)        | _*        |j+        | _+        || _        |j        | _        |j,        | _,        g | _-        g | _.        g | _/        d S )Nz,Cannot compile a device function as a kernelF   r   )fastmathoptdebuglineinfoinliner1   nvvm_optionscccudaCGGetIntrinsicHandleTc                 F    g | ]}d |                                  v |S )__numba_wrapper_)get_asm_str).0fnlibs     5lib/python3.11/site-packages/numba/cuda/dispatcher.py
<listcomp>z$_Kernel.__init__.<locals>.<listcomp>l   sB     B B Bb*b**coo.?.??? ???    z9https://numba.readthedocs.io/en/stable/cuda/bindings.htmlzUse of float16 requires the use of the NVIDIA CUDA bindings and setting the NUMBA_CUDA_USE_NVIDIA_BINDING environment variable to 1. Relevant documentation is available here:
zcpp_function_wrappers.cu)0RuntimeErrorsuper__init__
objectmodeentry_pointpy_funcargtypesr4   r5   
extensionsr   compute_capabilityr   r   voidtarget_context__code__co_filenameco_firstlinenoprepare_cuda_kernellibraryfndescr<   cooperativeneeds_cudadevrtcuda_fp16_math_funcsr   CUDA_USE_NVIDIA_BINDINGNotImplementedErrorospathdirnameabspath__file__joinappendadd_linking_filename
entry_name	signaturetype_annotation_type_annotation_codelibrarycall_helperenvironment_referenced_environmentsliftedreload_init)selfrH   rI   linkr4   r5   r6   r1   rJ   max_registersr2   devicer7   r8   crestgt_ctxcodefilenamelinenumkernelressmsgbasedirfunctions_cu_pathfilepathr?   	__class__s                             @r@   rE   z_Kernel.__init__.   s   
  	OMNNN     
 $* !?11
 

  !!4DL%*dm"&*%-#)%-)5!# # # %|$#%11$,27<2:G2?A AV
  	D 69J9JJ 	'"&CB B B B0 B B B  	+1 /O 	 
 *#... goobgooh&?&?@@G "W-G!I !IKK)*** 	+ 	+H  **** !+ $ 4+ &k+(*%rB   c                     | j         S N)rf   rl   s    r@   rR   z_Kernel.library   s      rB   c                     | j         S r~   )re   r   s    r@   rd   z_Kernel.type_annotation   s    $$rB   c                     | j         S r~   )ri   r   s    r@   _find_referenced_environmentsz%_Kernel._find_referenced_environments   s    ,,rB   c                 4    | j                                         S r~   )rM   codegenr   s    r@   r   z_Kernel.codegen   s    "**,,,rB   c                 4    t          | j        j                  S r~   )tuplerc   argsr   s    r@   argument_typesz_Kernel.argument_types   s    T^()))rB   c	                    |                      |           }	t          | |	                                           d|	_        ||	_        ||	_        ||	_        d|	_        ||	_        ||	_	        ||	_
        ||	_        ||	_        |	S )&
        Rebuild an instance.
        N)__new__rD   rE   rG   rT   rb   rc   re   rf   r4   r5   rg   rJ   )clsrT   ra   rc   codelibraryr4   r5   rg   rJ   instancer|   s             r@   _rebuildz_Kernel._rebuild   s     ;;s##c8%%'''#*"&$(! +$*(rB   c           
          t          | j        | j        | j        | j        | j        | j        | j        | j                  S )a  
        Reduce the instance for serialization.
        Compiled definitions are serialized in PTX form.
        Type annotation are discarded.
        Thread, block and shared memory configuration are serialized.
        Stream information is discarded.
        )rT   ra   rc   r   r4   r5   rg   rJ   )	dictrT   rb   rc   rf   r4   r5   rg   rJ   r   s    r@   _reduce_statesz_Kernel._reduce_states   sE      0t"n$:K*t} $ 0T_N N N 	NrB   c                 8    | j                                          dS )z7
        Force binding to current CUDA context
        N)rf   
get_cufuncr   s    r@   bindz_Kernel.bind   s     	$$&&&&&rB   c                 H    | j                                         j        j        S )zN
        The number of registers used by each thread for this kernel.
        )rf   r   attrsregsr   s    r@   regs_per_threadz_Kernel.regs_per_thread   s    
  ++--388rB   c                 H    | j                                         j        j        S )zD
        The amount of constant memory used by this kernel.
        )rf   r   r   constr   s    r@   const_mem_sizez_Kernel.const_mem_size       
  ++--399rB   c                 H    | j                                         j        j        S )zM
        The amount of shared memory used per block for this kernel.
        )rf   r   r   sharedr   s    r@   shared_mem_per_blockz_Kernel.shared_mem_per_block   s    
  ++--3::rB   c                 H    | j                                         j        j        S )z:
        The maximum allowable threads per block.
        )rf   r   r   
maxthreadsr   s    r@   max_threads_per_blockz_Kernel.max_threads_per_block   s    
  ++--3>>rB   c                 H    | j                                         j        j        S )zM
        The amount of local memory used per thread for this kernel.
        )rf   r   r   localr   s    r@   local_mem_per_threadz_Kernel.local_mem_per_thread   r   rB   c                 4    | j                                         S )z6
        Returns the LLVM IR for this kernel.
        )rf   get_llvm_strr   s    r@   inspect_llvmz_Kernel.inspect_llvm   s      --///rB   c                 8    | j                             |          S )z7
        Returns the PTX code for this kernel.
        )r8   )rf   r<   )rl   r8   s     r@   inspect_asmz_Kernel.inspect_asm   s      ,,,333rB   c                 4    | j                                         S )zp
        Returns the SASS code for this kernel.

        Requires nvdisasm to be available on the PATH.
        )rf   get_sassr   s    r@   inspect_sassz_Kernel.inspect_sass   s      ))+++rB   c                     | j         t          d          |t          j        }t	          | j        d| j        |           t	          d|           t	          | j         |           t	          d|           dS )
        Produce a dump of the Python source of this function annotated with the
        corresponding Numba IR and type information. The dump is written to
        *file*, or *sys.stdout* if *file* is *None*.
        Nz Type annotation is not available filezP--------------------------------------------------------------------------------zP================================================================================)re   
ValueErrorsysstdoutprintrb   r   )rl   r   s     r@   inspect_typesz_Kernel.inspect_types  s      (?@@@<:D$*=*=>TJJJJhT""""d#$////hT""""""rB   r   c                     t                      }| j                                        }t          |t                    rt          j        d |          }|                    |||          }|j        j	        }||z  S )a  
        Calculates the maximum number of blocks that can be launched for this
        kernel in a cooperative grid in the current context, for the given block
        and dynamic shared memory sizes.

        :param blockdim: Block dimensions, either as a scalar for a 1D block, or
                         a tuple for 2D or 3D blocks.
        :param dynsmemsize: Dynamic shared memory size in bytes.
        :return: The maximum number of blocks in the grid.
        c                     | |z  S r~    )xys     r@   <lambda>z5_Kernel.max_cooperative_grid_blocks.<locals>.<lambda>(  s
    QU rB   )
r   rf   r   
isinstancer   	functoolsreduce$get_active_blocks_per_multiprocessorro   MULTIPROCESSOR_COUNT)rl   blockdimdynsmemsizectxcufuncactive_per_smsm_counts          r@   max_cooperative_grid_blocksz#_Kernel.max_cooperative_grid_blocks  s     mm"--//h&& 	F '(:(:HEEH@@AIALN N :2x''rB   c                 J   | j                                         | j        ruj        dz   }j                            |          \  }}|t          j        t          j                  k    sJ t          j                    }	|	                    d|           g }
g }t          | j        |          D ]\  }}|                     ||||
|           t          j        r t          j                            d          }nd }|r|j        p|}t          j        j        g|||||R d| j        i | j        rt          j        t          j        |	          ||           |	j        dk    rfdfddD             }fddD             }|	j        }| j                            |          \  }}}|d	}n1|\  }}}t2          j                            |          }d
|d|d|d}|d|d|}|r|d|d         f|dd          z   }n|f} || |
D ]} |             d S )N__errcode__r   )streamrT   c                     j                             j        d| d          \  }}t          j                    }t          j        t          j        |          ||           |j        S )N__)	moduleget_global_symbolra   ctypesc_intr   device_to_host	addressofvalue)ra   memszvalr   s       r@   load_symbolz#_Kernel.launch.<locals>.load_symbolU  sk    $m==?E{{{?Ctt?E F FGC !,..C)&*:3*?*?bIII9$rB   c                 ,    g | ]} d |z             S )tidr   r=   ir   s     r@   rA   z"_Kernel.launch.<locals>.<listcomp>]  s'    ===!{{519--===rB   zyxc                 ,    g | ]} d |z             S )ctaidr   r   s     r@   rA   z"_Kernel.launch.<locals>.<listcomp>^  s'    AAAaWq[11AAArB    zIn function z, file z, line z, ztid=z ctaid=z:    )rf   r   r4   ra   r   r   r   sizeofr   memsetzipr   _prepare_argsr   USE_NV_BINDINGbindingCUstreamhandlelaunch_kernelrT   r   r   r   rg   get_exceptionrY   rZ   r\   )rl   r   griddimr   r   	sharedmemexcnameexcmemexcszexcvalretr
kernelargstvzero_streamstream_handler   r   rr   excclsexc_argsloclocinfosymr{   linenoprefixwbr   r   s                               @@r@   launchz_Kernel.launch/  s   "--//: 	,kM1G"M;;GDDMFEFM&,777777\^^FMM!FM+++ 
+T22 	? 	?DAqq!VT:>>>>  	 .11!44KKK06=?K 	V] 	;%	;&	; '	; +		;
 (	; 	; 	; *.)9	; 	; 	; :  	(!&"26":":FEJJJ|q  % % % % % >===u===AAAA5AAA|(,(8(F(Ft(L(L%#; GG,/)C6!wx88HHFIccFNhhFLffOG 18eeD ',2FFHQKK @B  %HH  &wHfh''  	 	BBDDDD	 	rB   c                 ,   t          | j                  D ]}|                    ||||          \  }}t          |t          j                  rt          |                              ||          }t          j	        }t          j
        d          }	t          j
        d          }
 ||j                  } ||j        j                  }t          j        |          }t          j        rt#          |          }t          j
        |          }|                    |	           |                    |
           |                    |           |                    |           |                    |           t'          |j                  D ]+}|                     ||j        |                              ,t'          |j                  D ]+}|                     ||j        |                              ,dS t          |t          j                  r8 t1          t          d|z            |          }|                    |           dS |t          j        k    rZt          j        t7          j        |                              t6          j                            }|                    |           dS |t          j        k    r+t          j        |          }|                    |           dS |t          j         k    r+t          j!        |          }|                    |           dS |t          j"        k    r8t          j#        t#          |                    }|                    |           dS |t          j$        k    rZ|                    t          j!        |j%                             |                    t          j!        |j&                             dS |t          j'        k    rZ|                    t          j        |j%                             |                    t          j        |j&                             dS t          |t          j(        t          j)        f          rF|                    t          j*        |                    t6          j+                                       dS t          |t          j,                  rnt          |                              ||          }|j-        }t          j        r!t          j
        t#          |                    }|                    |           dS t          |t          j.                  rSt_          |          t_          |          k    sJ ta          ||          D ]\  }}| 1                    |||||           dS t          |t          j2                  rD	 | 1                    |j        |j3        |||           dS # th          $ r ti          ||          w xY wti          ||          )zF
        Convert arguments to ctypes and append to kernelargs
        )r   r   r   zc_%sN)5reversedrJ   prepare_argsr   r   Arrayr   	to_devicer   	c_ssize_tc_void_psizedtypeitemsizer   device_pointerr   intr_   rangendimshapestridesIntegergetattrfloat16c_uint16npviewuint16float64c_doublefloat32c_floatbooleanc_uint8	complex64realimag
complex128
NPDatetimeNPTimedeltac_int64int64Recorddevice_ctypes_pointer	BaseTuplelenr   r   
EnumMemberr   rX   )rl   tyr   r   r   r   	extensiondevaryc_intpmeminfoparentnitemsr  ptrdataaxcvaldevrecr   r   s                       r@   r   z_Kernel._prepare_argsw  s    "$/22 	 	I,,	 -  GB b%+&& O	/c]],,T6::F%Foa((G_Q''FVFK((Fvfl344H'//C$ #hh?3''Dg&&&f%%%f%%%h'''d###FK(( < <!!&&b)9":":;;;;FK(( > >!!&&);"<"<====> > EM** 4	//766B;//44Dd#####5=  ?2:c??#7#7	#B#BCCDd#####5=  ?3''Dd#####5=  >#&&Dd#####5=  >#c((++Dd#####5?""fnSX66777fnSX66777775###foch77888foch7788888U-u/@ABB 	/fnSXXbh-?-?@@AAAAAEL)) 	/c]],,T6::F.C$ 0oc#hh//c"""""EO,, 	/r77c#hh&&&&B C C1""1azBBBBC C E,-- 		/3""Hciz     ' 3 3 3)"c2223 &b#...s   #W( (X)	NFFFFNNTFr~   )r   r   r   )__name__
__module____qualname____doc__r   rE   propertyrR   rd   r   r   r   classmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r  r   __classcell__r|   s   @r@   r.   r.   (   s0        
 ;@JN6;d d d d d dL ! ! X! % % X%- - - - - X- * * X*     [*N N N' ' ' 9 9 X9 : : X: ; ; X; ? ? X? : : X:0 0 04 4 4, , ,# # # #"( ( ( (,F F F FP\/ \/ \/ \/ \/ \/ \/rB   r.   c                        e Zd Zd Zd Zd ZdS )ForAllc                 |    |dk     rt          d|z            || _        || _        || _        || _        || _        d S )Nr   z0Can't create ForAll with negative task count: %s)r   
dispatcherntasksthread_per_blockr   r   )rl   rE  rF  tpbr   r   s         r@   rE   zForAll.__init__  sP    A::O%& ' ' '$ #"rB   c                     | j         dk    rd S | j        j        r| j        }n | j        j        | }|                     |          }| j         |z   dz
  |z  } |||| j        | j        f         | S )Nr   r   )rF  rE  specialized
specialize_compute_thread_per_blockr   r   )rl   r   rJ  r   r   s        r@   __call__zForAll.__call__  s    ;!F?& 	</KK4$/4d;K11+>>;)A-(:+{7Hdk>* +,02 	2rB   c                 $   | j         }|dk    r|S t                      }t          t          |j                                                            }t          |j                                        d| j	        d          } |j
        di |\  }}|S )Nr   i   )funcb2d_funcmemsizeblocksizelimitr   )rG  r   nextiter	overloadsvaluesr   rf   r   r   get_max_potential_block_size)rl   rE  rH  r   ru   kwargs_s          r@   rL  z ForAll._compute_thread_per_block  s    #!88J --C $z3::<<==>>F(3355#	  F 6S5????FAsJrB   N)r:  r;  r<  rE   rM  rL  r   rB   r@   rC  rC    sA        # # #2 2 2    rB   rC  c                       e Zd Zd Zd ZdS )_LaunchConfigurationc                     || _         || _        || _        || _        || _        t
          j        rFd}|d         |d         z  |d         z  }||k     r&d| d}t          t          |                     d S d S d S )N   r   r      z
Grid size zB will likely result in GPU under-utilization due to low occupancy.)	rE  r   r   r   r   r   CUDA_LOW_OCCUPANCY_WARNINGSr   r   )	rl   rE  r   r   r   r   min_grid_size	grid_sizerx   s	            r@   rE   z_LaunchConfiguration.__init__  s    $ "- 	3  M
WQZ/'!*<I=((AI A A A,S1122222	3 	3 )(rB   c                 f    | j                             || j        | j        | j        | j                  S r~   )rE  callr   r   r   r   rl   r   s     r@   rM  z_LaunchConfiguration.__call__  s2    ##D$,$(KA A 	ArB   N)r:  r;  r<  rE   rM  r   rB   r@   r[  r[    s7        3 3 3.A A A A ArB   r[  c                        e Zd Zd Zd Zd ZdS )CUDACacheImplc                 *    |                                 S r~   )r   )rl   ru   s     r@   r   zCUDACacheImpl.reduce"  s    $$&&&rB   c                 $    t          j        di |S )Nr   )r.   r   )rl   rM   payloads      r@   rebuildzCUDACacheImpl.rebuild%  s    **'***rB   c                     dS )NTr   )rl   rp   s     r@   check_cachablezCUDACacheImpl.check_cachable(  s	     trB   N)r:  r;  r<  r   rj  rl  r   rB   r@   rf  rf  !  sA        ' ' '+ + +    rB   rf  c                       e Zd ZdZeZdS )	CUDACachezS
    Implements a cache that saves and loads CUDA kernels and compile results.
    N)r:  r;  r<  r=  rf  _impl_classr   rB   r@   rn  rn  3  s           KKKrB   rn  c                   V    e Zd ZdZdZeZef fd	Ze	d             Z
d Z ej        d          d#d	            Zd
 Zd$dZe	d             Zd Zd Zd Zd Zd Ze	d             Zd%dZd%dZd%dZd%dZd%dZd Zd%dZd Zd Z d%dZ!d%dZ"d%dZ#d%d Z$e%d!             Z&d" Z' xZ(S )&CUDADispatchera  
    CUDA Dispatcher object. When configured and called, the dispatcher will
    specialize itself for the given arguments (if no suitable specialized
    version already exists) & compute capability, and launch on the device
    associated with the current context.

    Dispatcher objects are not to be constructed by the user, but instead are
    created using the :func:`numba.cuda.jit` decorator.
    Fc                 l    t                                          |||           d| _        i | _        d S )N)targetoptionspipeline_classF)rD   rE   _specializedspecializations)rl   rH   rs  rt  r|   s       r@   rE   zCUDADispatcher.__init__L  sE    (6 	 	8 	8 	8 "  "rB   c                 *    t          j        |           S r~   )
cuda_typesrq  r   s    r@   _numba_type_zCUDADispatcher._numba_type_\  s    (...rB   c                 8    t          | j                  | _        d S r~   )rn  rH   _cacher   s    r@   enable_cachingzCUDADispatcher.enable_caching`  s    --rB   r]  )maxsizer   c                 N    t          ||          \  }}t          | ||||          S r~   )r   r[  )rl   r   r   r   r   s        r@   	configurezCUDADispatcher.configurec  s,    7JJ#D'8VYOOOrB   c                 V    t          |          dvrt          d           | j        | S )N)r^  r0      z.must specify at least the griddim and blockdim)r+  r   r  rd  s     r@   __getitem__zCUDADispatcher.__getitem__h  s1    t99I%%MNNNt~t$$rB   c                 *    t          | ||||          S )a3  Returns a 1D-configured dispatcher for a given number of tasks.

        This assumes that:

        - the kernel maps the Global Thread ID ``cuda.grid(1)`` to tasks on a
          1-1 basis.
        - the kernel checks that the Global Thread ID is upper-bounded by
          ``ntasks``, and does nothing if it is not.

        :param ntasks: The number of tasks.
        :param tpb: The size of a block. An appropriate value is chosen if this
                    parameter is not supplied.
        :param stream: The stream on which the configured dispatcher will be
                       launched.
        :param sharedmem: The number of bytes of dynamic shared memory required
                          by the kernel.
        :return: A configured dispatcher, ready to launch on a set of
                 arguments.)rH  r   r   )rC  )rl   rF  rH  r   r   s        r@   forallzCUDADispatcher.forallm  s    ( dFFiPPPPrB   c                 6    | j                             d          S )aS  
        A list of objects that must have a `prepare_args` function. When a
        specialized kernel is called, each argument will be passed through
        to the `prepare_args` (from the last object in this list to the
        first). The arguments to `prepare_args` are:

        - `ty` the numba type of the argument
        - `val` the argument value itself
        - `stream` the CUDA stream used for the current call to the kernel
        - `retr` a list of zero-arg functions that you may want to append
          post-call cleanup work to.

        The `prepare_args` function must return a tuple `(ty, val)`, which
        will be passed in turn to the next right-most `extension`. After all
        the extensions have been called, the resulting `(ty, val)` will be
        passed into Numba's default argument marshalling logic.
        rJ   )rs  getr   s    r@   rJ   zCUDADispatcher.extensions  s    & !%%l333rB   c                 *    t          t                    r~   )r   r   )rl   r   rX  s      r@   rM  zCUDADispatcher.__call__  s    2333rB   c                     | j         r4t          t          | j                                                            }nt          j        j        | g|R  }|                    |||||           dS )zJ
        Compile if necessary and invoke this kernel with *args*.
        N)	rJ  rS  rT  rU  rV  r   r   
_cuda_callr  )rl   r   r   r   r   r   ru   s          r@   rc  zCUDADispatcher.call  sp      	D$t~44667788FF +6tCdCCCFdGXvyAAAAArB   c                 l     |rJ  fd|D             }                      t          |                    S )Nc                 :    g | ]}                     |          S r   )typeof_pyvalr=   arl   s     r@   rA   z4CUDADispatcher._compile_for_args.<locals>.<listcomp>  s'    777QD%%a((777rB   )compiler   )rl   r   kwsrI   s   `   r@   _compile_for_argsz CUDADispatcher._compile_for_args  s=    7777$777||E(OO,,,rB   c                     	 t          |t          j                  S # t          $ rF t	          j        |          r0t          t	          j        |d          t          j                  cY S  w xY w)NF)sync)r   r   argumentr   r   is_cuda_arrayas_cuda_array)rl   r   s     r@   r  zCUDADispatcher.typeof_pyval  s    		#w/000 	 	 	!#&&  d05AAA%.0 0 0 0 0 	s    AA,*A,c                     t                      j        }t           fd|D                       } j        rt	          d           j                            ||f          }|r|S  j        }t           j	        |          }|
                    |           |                                 d|_        | j        ||f<   |S )zd
        Create a new instance of this dispatcher specialized for the given
        *args*.
        c                 D    g | ]}j                             |          S r   )	typingctxresolve_argument_typer  s     r@   rA   z-CUDADispatcher.specialize.<locals>.<listcomp>  s)    CCCT^11!44CCCrB   zDispatcher already specialized)rs  T)r   rK   r   rJ  rC   rv  r  rs  rq  rH   r  disable_compileru  )rl   r   r8   rI   specializationrs  s   `     r@   rK  zCUDADispatcher.specialize  s    
  !!4CCCCdCCCE E 	A?@@@-112x.AA 	"!!*'6CE E Ex(((&&(((&*#-;R\*rB   c                     | j         S )z>
        True if the Dispatcher has been specialized.
        )ru  r   s    r@   rJ  zCUDADispatcher.specialized  s    
   rB   Nc                     || j         |j                 j        S | j        r8t	          t          | j                                                             j        S d | j                                         D             S )a  
        Returns the number of registers used by each thread in this kernel for
        the device in the current context.

        :param signature: The signature of the compiled kernel to get register
                          usage for. This may be omitted for a specialized
                          kernel.
        :return: The number of registers used by the compiled variant of the
                 kernel for the given signature and current device.
        Nc                 $    i | ]\  }}||j         S r   )r   r=   sigoverloads      r@   
<dictcomp>z6CUDADispatcher.get_regs_per_thread.<locals>.<dictcomp>  s7     A A A%X 1 A A ArB   )rU  r   r   rJ  rS  rT  rV  itemsrl   rc   s     r@   get_regs_per_threadz"CUDADispatcher.get_regs_per_thread  s      >).1AA 	AT^22445566FFA A)-)=)=)?)?A A A ArB   c                     || j         |j                 j        S | j        r8t	          t          | j                                                             j        S d | j                                         D             S )a  
        Returns the size in bytes of constant memory used by this kernel for
        the device in the current context.

        :param signature: The signature of the compiled kernel to get constant
                          memory usage for. This may be omitted for a
                          specialized kernel.
        :return: The size in bytes of constant memory allocated by the
                 compiled variant of the kernel for the given signature and
                 current device.
        Nc                 $    i | ]\  }}||j         S r   )r   r  s      r@   r  z5CUDADispatcher.get_const_mem_size.<locals>.<dictcomp>  s7     A A A%X 0 A A ArB   )rU  r   r   rJ  rS  rT  rV  r  r  s     r@   get_const_mem_sizez!CUDADispatcher.get_const_mem_size  s      >).1@@ 	AT^22445566EEA A)-)=)=)?)?A A A ArB   c                     || j         |j                 j        S | j        r8t	          t          | j                                                             j        S d | j                                         D             S )a  
        Returns the size in bytes of statically allocated shared memory
        for this kernel.

        :param signature: The signature of the compiled kernel to get shared
                          memory usage for. This may be omitted for a
                          specialized kernel.
        :return: The amount of shared memory allocated by the compiled variant
                 of the kernel for the given signature and current device.
        Nc                 $    i | ]\  }}||j         S r   )r   r  s      r@   r  z;CUDADispatcher.get_shared_mem_per_block.<locals>.<dictcomp>  7     A A A%X 6 A A ArB   )rU  r   r   rJ  rS  rT  rV  r  r  s     r@   get_shared_mem_per_blockz'CUDADispatcher.get_shared_mem_per_block        >).1FF 	AT^22445566KKA A)-)=)=)?)?A A A ArB   c                     || j         |j                 j        S | j        r8t	          t          | j                                                             j        S d | j                                         D             S )a(  
        Returns the maximum allowable number of threads per block
        for this kernel. Exceeding this threshold will result in
        the kernel failing to launch.

        :param signature: The signature of the compiled kernel to get the max
                          threads per block for. This may be omitted for a
                          specialized kernel.
        :return: The maximum allowable threads per block for the compiled
                 variant of the kernel for the given signature and current
                 device.
        Nc                 $    i | ]\  }}||j         S r   )r   r  s      r@   r  z<CUDADispatcher.get_max_threads_per_block.<locals>.<dictcomp>&  s7     A A A%X 7 A A ArB   )rU  r   r   rJ  rS  rT  rV  r  r  s     r@   get_max_threads_per_blockz(CUDADispatcher.get_max_threads_per_block  s      >).1GG 	AT^22445566LLA A)-)=)=)?)?A A A ArB   c                     || j         |j                 j        S | j        r8t	          t          | j                                                             j        S d | j                                         D             S )a  
        Returns the size in bytes of local memory per thread
        for this kernel.

        :param signature: The signature of the compiled kernel to get local
                          memory usage for. This may be omitted for a
                          specialized kernel.
        :return: The amount of local memory allocated by the compiled variant
                 of the kernel for the given signature and current device.
        Nc                 $    i | ]\  }}||j         S r   )r   r  s      r@   r  z;CUDADispatcher.get_local_mem_per_thread.<locals>.<dictcomp>9  r  rB   )rU  r   r   rJ  rS  rT  rV  r  r  s     r@   get_local_mem_per_threadz'CUDADispatcher.get_local_mem_per_thread)  r  rB   c                    | j         r"|                     t          |                     | j        j        }d                    |          }t          j        ||| j                  }t          j
        | j                  }||||fS )z
        Get a typing.ConcreteTemplate for this dispatcher and the given
        *args* and *kws* types.  This allows resolution of the return type.

        A (template, pysig, args, kws) tuple is returned.
        zCallTemplate({0}))key
signatures)_can_compilecompile_devicer   rH   r:  formatr   make_concrete_templatenopython_signaturesr   pysignature)rl   r   r  	func_namera   call_templatepysigs          r@   get_call_templatez CUDADispatcher.get_call_template<  s      	-d,,, L)	")))445iD,DF F F!$,//eT3..rB   c                 R   || j         vr| j        5  | j                            d          }| j                            d          }| j                            d          }| j                            d          }| j                            d          rdnd|d}t	                      j        }t          | j        ||||||||		  	        }	|	| j         |<   |	j        	                    |	j
        |	j        |	j        g           d
d
d
           n# 1 swxY w Y   n| j         |         }	|	S )zCompile the device function for the given argument types.

        Each signature is compiled once by caching the compiled function inside
        this object.

        Returns the `CompileResult`.
        r4   r5   r6   r1   r2   r0   r   )r2   r1   r3   N)rU  _compiling_counterrs  r  r   rK   r   rH   rM   insert_user_functionrG   rS   rR   )
rl   r   return_typer4   r5   r6   r1   r7   r8   rp   s
             r@   r  zCUDADispatcher.compile_deviceW  s    t~%%( I I*..w77-11*==+//99-11*== !% 2 6 6u = =D111 (   
 ())<#DL+t*/-5+1-51=')+ + + (,t$#889I9=:>,I I I-I I I I I I I I I I I I I I I4 >$'Ds   C0DDDc                 b    d |D             }|                      ||d           || j        |<   d S )Nc                     g | ]	}|j         
S r   )_code)r=   r  s     r@   rA   z/CUDADispatcher.add_overload.<locals>.<listcomp>  s    +++Q+++rB   Tr   )_insertrU  )rl   ru   rI   c_sigs       r@   add_overloadzCUDADispatcher.add_overload~  s?    ++(+++UF...#)x   rB   c                    t          j        |          \  }}||t          j        k    sJ | j        r3t          t          | j                                                            S | j        	                    |          }||S | j
                            || j                  }|| j        |xx         dz  cc<   nr| j        |xx         dz  cc<   | j        st!          d          t#          | j        |fi | j        }|                                 | j
                            ||           |                     ||           |S )z
        Compile and bind to the current context a version of this kernel
        specialized for the given signature.
        Nr   zCompilation disabled)r   normalize_signaturer   nonerJ  rS  rT  rU  rV  r  r{  load_overload	targetctx_cache_hits_cache_missesr  rC   r.   rH   rs  r   save_overloadr  )rl   r  rI   r  ru   s        r@   r  zCUDADispatcher.compile  s\   
 !) <S A A+"kUZ&?&?&?&?  	T^224455666^''11F! **3??S!!!Q&!!!! s###q(###$ ;"#9:::T\8JJt7IJJFKKMMMK%%c6222&(+++rB   c                 T   | j                             d          }|E|r$| j        |         j                                        S | j        |                                         S |r#d | j                                        D             S d | j                                        D             S )z
        Return the LLVM IR for this kernel.

        :param signature: A tuple of argument types.
        :return: The LLVM IR for the given signature, or a dict of LLVM IR
                 for all previously-encountered signatures.

        ro   Nc                 H    i | ]\  }}||j                                          S r   )rR   r   r  s      r@   r  z/CUDADispatcher.inspect_llvm.<locals>.<dictcomp>  sC     E E E)C X-::<< E E ErB   c                 >    i | ]\  }}||                                 S r   )r   r  s      r@   r  z/CUDADispatcher.inspect_llvm.<locals>.<dictcomp>  s@     E E E)C X2244 E E ErB   )rs  r  rU  rR   r   r   r  )rl   rc   ro   s      r@   r   zCUDADispatcher.inspect_llvm  s     #''11  @~i08EEGGG~i0==??? EE E-1^-A-A-C-CE E E EE E-1^-A-A-C-CE E E ErB   c                    t                      j        | j                            d          }|G|r%| j        |         j                                      S | j        |                                       S |r%fd| j                                        D             S fd| j                                        D             S )a+  
        Return this kernel's PTX assembly code for for the device in the
        current context.

        :param signature: A tuple of argument types.
        :return: The PTX code for the given signature, or a dict of PTX codes
                 for all previously-encountered signatures.
        ro   Nc                 L    i | ] \  }}||j                                       !S r   )rR   r<   r=   r  r  r8   s      r@   r  z.CUDADispatcher.inspect_asm.<locals>.<dictcomp>  sF     E E E)C X-99"== E E ErB   c                 B    i | ]\  }}||                               S r   )r   r  s      r@   r  z.CUDADispatcher.inspect_asm.<locals>.<dictcomp>  sC     E E E)C X11"55 E E ErB   )	r   rK   rs  r  rU  rR   r<   r   r  )rl   rc   ro   r8   s      @r@   r   zCUDADispatcher.inspect_asm  s      !!4#''11  A~i08DDRHHH~i0<<R@@@ EE E E E-1^-A-A-C-CE E E EE E E E-1^-A-A-C-CE E E ErB   c                     | j                             d          rt          d          || j        |                                         S d | j                                        D             S )a  
        Return this kernel's SASS assembly code for for the device in the
        current context.

        :param signature: A tuple of argument types.
        :return: The SASS code for the given signature, or a dict of SASS codes
                 for all previously-encountered signatures.

        SASS for the device in the current context is returned.

        Requires nvdisasm to be available on the PATH.
        ro   z(Cannot inspect SASS of a device functionNc                 >    i | ]\  }}||                                 S r   )r   )r=   r  defns      r@   r  z/CUDADispatcher.inspect_sass.<locals>.<dictcomp>  s:     = = =!T **,, = = =rB   )rs  r  rC   rU  r   r  r  s     r@   r   zCUDADispatcher.inspect_sass  s{     !!(++ 	KIJJJ >),99;;;= =%)^%9%9%;%;= = = =rB   c                     |t           j        }| j                                        D ]\  }}|                    |           dS )r   Nr   )r   r   rU  r  r   )rl   r   rY  r  s       r@   r   zCUDADispatcher.inspect_types  sU     <:D~++-- 	* 	*GAtD))))	* 	*rB   c                      | ||          }|S )r   r   )r   rH   rs  r   s       r@   r   zCUDADispatcher._rebuild  s    
 3w..rB   c                 8    t          | j        | j                  S )zd
        Reduce the instance for serialization.
        Compiled definitions are discarded.
        )rH   rs  )r   rH   rs  r   s    r@   r   zCUDADispatcher._reduce_states   s%    
 DL"&"46 6 6 	6rB   r9  )r   r   r   r~   ))r:  r;  r<  r=  
_fold_argsr   targetdescrr   rE   r>  ry  r|  r   	lru_cacher  r  r  rJ   rM  rc  r  r  rK  rJ  r  r  r  r  r  r  r  r  r  r   r   r   r   r?  r   r   r@  rA  s   @r@   rq  rq  :  s         JK>J " " " " " "  / / X/. . . Y%%%P P P &%P% % %
Q Q Q Q, 4 4 X4(4 4 4	B 	B 	B- - -    0 ! ! X!A A A A&A A A A(A A A A&A A A A*A A A A&/ / /6% % % %N* * *
" " "HE E E E.E E E E0= = = =,
* 
* 
* 
*   [6 6 6 6 6 6 6rB   rq  )9numpyr  rY   r   r   r   
numba.corer   r   r   r   r   r   numba.core.cachingr	   r
   numba.core.compiler_lockr   numba.core.dispatcherr   numba.core.errorsr   numba.core.typing.typeofr   r   numba.cuda.apir   numba.cuda.argsr   numba.cuda.compilerr   r   numba.cuda.cudadrvr   numba.cuda.cudadrv.devicesr   numba.cuda.descriptorr   numba.cuda.errorsr   r   
numba.cudarx  numbar   r   warningsr   rV   ReduceMixinr.   objectrC  r[  rf  rn  rq  r   rB   r@   <module>r     s       				 



      H H H H H H H H H H H H H H H H / / / / / / / / 9 9 9 9 9 9 , , , , , , 5 5 5 5 5 5 4 4 4 4 4 4 4 4 - - - - - - $ $ $ $ $ $ : : : : : : : : % % % % % % 2 2 2 2 2 2 - - - - - -< < < < < < < < * * * * * *                  * * * k/ k/ k/ k/ k/i# k/ k/ k/\+ + + + +V + + +\A A A A A A A A:    I   $               L6 L6 L6 L6 L6Z!6 L6 L6 L6 L6 L6rB   