
    ܙd                     )   d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d
dlmZ d dlmZ d dlmZmZmZ d dlm Z m!Z!m"Z"  e
            Z#e#j$        Z$e#j%        Z&e#j'        Z'd Z( e& ej)        e          d          d             Z* e& ej)        e          d          d             Z+ e& ej)        e          d          d             Z, e& ej)        e          d          d             Z- e& ej)        e          d          d             Z. e&e d          d             Z/ e&e d          d             Z0 e&e d          d             Z1 e$ej2        j3                  d              Z4 e$d!e!          d"             Z5 e$ej6        j7        ej8                  d#             Z9d a:d$ Z; e$ej<        j=        ej>        ej?                  d%             Z@ e$ej<        j=        ejA        ej?                   e$ej<        j=        ejB        ej?                  d&                         ZC e$ejD        j=        ej>        ej?                  d'             ZE e$ejD        j=        ejA        ej?                   e$ejD        j=        ejB        ej?                  d(                         ZF e$ejG                  d)             ZH e$ejI                  d*             ZJ e$ejK                  d+             ZL e$ejM                  d,             ZN e$ejM        ejO                  d-             ZP e$ejQ        ejO        ejO        ejO        ejO        ejO                   e$ejQ        ejO        ejO        ejR        ejO        ejO                   e$ejQ        ejO        ejO        ejS        ejO        ejO                   e$ejQ        ejO        ejO        ejT        ejO        ejO                  d.                                                 ZU e$ejV        ejO        ejO        ejW                  d/             ZX e$ejY        ejO        ejO                   e$ejY        ejO        ejR                   e$ejY        ejO        ejS                   e$ejY        ejO        ejT                  d0                                                 ZZ e$ej[        ejO        ejO                   e$ej[        ejO        ejR                   e$ej[        ejO        ejS                   e$ej[        ejO        ejT                  d1                                                 Z\ e$ej]                  d2             Z^ e$ej_                  d3             Z` e$eja        ej?                  d4             Zb e$ejc        ej?        ej?        ej?                  d5             Zdd6 Ze eejf        ejg                  d7             Zh eejg        ejf                  d8             Zid9 Zj eejf        ejk                  d:             Zl eejk        ejf                   eej>        ejf                  d;                         Zmd< Zn enejo        jp        d=            enejq        d=            enejr        d=            enejo        js        d>            enejt        d>            eneju        d>            enejo        jv        d?            enejw        d?            enejx        d?            e$ejo        jy        ejf                  d@             Zz e$ej{        ejf                  dA             Z| e$ejo        j}        ejf                  dB             Z~ e$eejf                  dC             Z e$ejo        j        ejf        ejf        ejf                  dD             Z e$ej        ejf        ejf                   e$ej        ejf        ejf                  dE                         ZdFZdG Z  e$ejo        j        ejf        ejf                   edH                       e$ej        ejf        ejf                   edH                       e$ejo        j        ejf        ejf                   edI                       e$ej        ejf        ejf                   edI                       e$ejo        j        ejf        ejf                   edJ                       e$ej        ejf        ejf                   edJ                       e$ejo        j        ejf        ejf                   edK                       e$ej        ejf        ejf                   edK                       e$ejo        j        ejf        ejf                   edL                       e$ej        ejf        ejf                   edL                       e$ejo        j        ejf        ejf                   edM                       e$ej        ejf        ejf                   edM                     dN Z eejo        j        dOdK            eejo        j        dPdM           ej        dQej        dRiZ e$ej        ej                   e$ej        ej                  dS                         Z e$ej        ej                  dT             Z e$ej        ej                  dU             Z e$ej        ej?                  dV             Z e$ej        ejO                   e$ej        ej                  dW                         Z e$ej        ejR                   e$ej        ej                  dX                         Z e$ej        ej?        ej?        ej?                  dY             Z e$eejS        ejS                  dZ             Z e$eejT        ejS                   e$eejS        ejT                   e$eejT        ejT                  d[                                     Z e$eejS        ejS                  d\             Z e$eejT        ejS                   e$eejS        ejT                   e$eejT        ejT                  d]                                     Z e$eejS                   e$eejT                  d^                         Z e$eejS        ejk                   e$eejT        ejk                  d_                         Zd` Zej        daz  Zdaej        z  Z  e$ej        ejS                   ee                       e$ej        ejT                   ee                       e$ej        ejS                   ee                       e$ej        ejT                   ee                     db Zdc Z e$ej        jq        ej8        ej        ej?                   e$ej        jq        ej8        ejB        ej?                   e$ej        jq        ej8        ejA        ej?                  edd                                                 Z e$ej        jt        ej8        ej        ej?                   e$ej        jt        ej8        ejB        ej?                   e$ej        jt        ej8        ejA        ej?                  ede                                                 Z e$ej        j        ej8        ej        ej?                   e$ej        j        ej8        ejB        ej?                   e$ej        j        ej8        ejA        ej?                  edf                                                 Z e$ej        j        ej8        ej        ej?                   e$ej        j        ej8        ejB        ej?                   e$ej        j        ej8        ejA        ej?                  edg                                                 Zdh Z eej        j        di            eej        j        dj            eej        j        dk            e$ej        j        ej8        ej        ej?                   e$ej        j        ej8        ejB        ej?                   e$ej        j        ej8        ejA        ej?                  edl                                                 Z e$ej        j        ej8        ej        ej?                   e$ej        j        ej8        ejA        ej?                   e$ej        j        ej8        ejB        ej?                  edm                                                 Z e$ej        j        ej8        ej        ej?                   e$ej        j        ej8        ejA        ej?                   e$ej        j        ej8        ejB        ej?                  edn                                                 Z e$ej        j        ej8        ej        ej?                   e$ej        j        ej8        ejA        ej?                   e$ej        j        ej8        ejB        ej?                  edo                                                 Z e$ej        j        ej8        ej        ej?                   e$ej        j        ej8        ejA        ej?                   e$ej        j        ej8        ejB        ej?                  edp                                                 Z e$ej        j        ej8        ej?        ej?                  dq             Z e$ej        j        ej8        ej        ej?        ej?                   e$ej        j        ej8        ejA        ej?        ej?                   e$ej        j        ej8        ejB        ej?        ej?                  dr                                     Z e$ej        ej                  ds             Z	 dwduZ e'e"          dv             Z e ej                    e$           dS )x    )reduceN)ir)Registry
lower_cast)parse_dtype)models)typescgutils)ufunc_db)register_ufuncs   )nvvm)cuda)	nvvmutilsstubserrors)dim3
grid_groupCUDADispatcherc                     t          j        | d|z            }t          j        | d|z            }t          j        | d|z            }t          j        | |||f          S )Nz%s.xz%s.yz%s.z)r   	call_sregr
   pack_struct)builderprefixxyzs        3lib/python3.11/site-packages/numba/cuda/cudaimpl.pyinitialize_dim3r      s]    GVf_55AGVf_55AGVf_55AwAq	222    	threadIdxc                 "    t          |d          S )Ntidr   contextr   sigargss       r   cuda_threadIdxr)       s    7E***r    blockDimc                 "    t          |d          S )Nntidr$   r%   s       r   cuda_blockDimr-   %   s    7F+++r    blockIdxc                 "    t          |d          S )Nctaidr$   r%   s       r   cuda_blockIdxr1   *   s    7G,,,r    gridDimc                 "    t          |d          S )Nnctaidr$   r%   s       r   cuda_gridDimr5   /   s    7H---r    laneidc                 ,    t          j        |d          S )Nr6   )r   r   r%   s       r   cuda_laneidr8   4   s    w111r    r   c                 .    |                     |d          S Nr   extract_valuer%   s       r   dim3_xr=   9         q)))r    r   c                 .    |                     |d          S Nr   r;   r%   s       r   dim3_yrA   >   r>   r    r   c                 .    |                     |d          S )N   r;   r%   s       r   dim3_zrD   C   r>   r    c                     |                      t          j        d          }|j        }|                    t          j        |          |f          S r@   )get_constantr	   int32modulecallr    declare_cudaCGGetIntrinsicHandle)r&   r   r'   r(   onelmods         r   cg_this_gridrM   H   sI    


u{A
.
.C>D<<2488	  r    zGridGroup.syncc                     |                      t          j        d          }|j        }|                    t          j        |          g ||R           S r:   )rF   r	   rG   rH   rI   r   declare_cudaCGSynchronize)r&   r   r'   r(   flagsrL   s         r   ptx_sync_grouprQ   Q   sQ      a00E>D<<+D11$  r    c                     |d         S r:    r%   s       r   cuda_const_array_likerT   \   s     7Nr    c                 L    t           dz  a d                    | t                     S )zDue to bug with NVVM invalid internalizing of shared memory in the
    PTX output.  We can't mark shared memory to be internal. We have to
    ensure unique name is generated for shared memory symbol.
    r   z{0}_{1})_unique_smem_idformatnames    r   _get_unique_smem_idrZ   f   s$     qOD/222r    c           	          |j         d         j        }t          |j         d                   }t          | ||f|t	          d          t
          j        d          S )Nr   r   _cudapy_smemTshapedtypesymbol_name	addrspacecan_dynsized)r(   literal_valuer   _generic_arrayrZ   r   ADDRSPACE_SHAREDr&   r   r'   r(   lengthr_   s         r   cuda_shared_array_integerrh   p   sX    Xa[&F$$E'76)5&9.&I&I$($9'+- - - -r    c           	          d |j         d         D             }t          |j         d                   }t          | |||t          d          t          j        d          S )Nc                     g | ]	}|j         
S rS   rc   .0ss     r   
<listcomp>z+cuda_shared_array_tuple.<locals>.<listcomp>}       444!ao444r    r   r   r\   Tr]   )r(   r   rd   rZ   r   re   r&   r   r'   r(   r^   r_   s         r   cuda_shared_array_tuplerr   z   sc     54sx{444E$$E'7%u&9.&I&I$($9'+- - - -r    c           	          |j         d         j        }t          |j         d                   }t          | ||f|dt          j        d          S )Nr   r   _cudapy_lmemFr]   )r(   rc   r   rd   r   ADDRSPACE_LOCALrf   s         r   cuda_local_array_integerrv      sP    Xa[&F$$E'76)5&4$($8',. . . .r    c           	          d |j         d         D             }t          |j         d                   }t          | |||dt          j        d          S )Nc                     g | ]	}|j         
S rS   rk   rl   s     r   ro   z(ptx_lmem_alloc_array.<locals>.<listcomp>   rp   r    r   r   rt   Fr]   )r(   r   rd   r   ru   rq   s         r   ptx_lmem_alloc_arrayry      s[     54sx{444E$$E'7%u&4$($8',. . . .r    c                     |rJ d}|j         }t          j        t          j                    d          }t	          j        |||          }|                    |d           |                                 S )Nzllvm.nvvm.membar.ctarS   rH   r   FunctionTypeVoidTyper
   get_or_insert_functionrI   get_dummy_valuer&   r   r'   r(   fnamerL   fntysyncs           r   ptx_threadfence_blockr      h    OOO"E>D?2;=="--D)$e<<DLLr""$$$r    c                     |rJ d}|j         }t          j        t          j                    d          }t	          j        |||          }|                    |d           |                                 S )Nzllvm.nvvm.membar.sysrS   r{   r   s           r   ptx_threadfence_systemr      r   r    c                     |rJ d}|j         }t          j        t          j                    d          }t	          j        |||          }|                    |d           |                                 S )Nzllvm.nvvm.membar.glrS   r{   r   s           r   ptx_threadfence_devicer      sh    OOO!E>D?2;=="--D)$e<<DLLr""$$$r    c                     |                      t          j        d          }t          j        t          j                  }t	          | |||g          S )Nl    )rF   r	   rG   noneptx_syncwarp_mask)r&   r   r'   r(   maskmask_sigs         r   ptx_syncwarpr      s@    Z88Dz%+&&HWgx$@@@r    c                    d}|j         }t          j        t          j                    t          j        d          f          }t          j        |||          }|                    ||           |                                 S )Nzllvm.nvvm.bar.warp.sync    )	rH   r   r|   r}   IntTyper
   r~   rI   r   r   s           r   r   r      sj    %E>D?2;==2:b>>*;<<D)$e<<DLLt""$$$r    c           
      H   |\  }}}}}|j         d         }	|	t          j        v r-|                    |t	          j        |	j                            }d}
|j        }t	          j        t	          j	        t	          j        d          t	          j        d          f          t	          j        d          t	          j        d          t	          j        d          t	          j        d          t	          j        d          f          }t          j        |||
          }|	j        dk    r|                    ||||||f          }|	t          j        k    rj|                    |d          }|                    |d          }|                    |t	          j                              }t          j        |||f          }n|                    |t	          j        d                    }|                    ||                     t          j        d                    }|                    |t	          j        d                    }|                    ||||||f          }|                    ||||||f          }|                    |d          }|                    |d          }|                    |d          }|                    |t	          j        d                    }|                    |t	          j        d                    }|                    ||                     t          j        d                    }|                    ||          }|	t          j        k    r'|                    |t	          j                              }t          j        |||f          }|S )a  
    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
    function supports both 32 and 64 bit ints and floats, so for feature parity,
    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
    an int, then shuffling, then bitcasting back. And 64-bit values by packing
    them into 2 32bit values, shuffling thoose, and then packing back together.
    rC   zllvm.nvvm.shfl.sync.i32r   r   r   @   )r(   r	   real_domainbitcastr   r   bitwidthrH   r|   LiteralStructTyper
   r~   rI   float32r<   	FloatTypemake_anonymous_structtrunclshrrF   i8zextshlor_float64
DoubleType)r&   r   r'   r(   r   modevalueindexclamp
value_typer   rL   r   funcretrvpredfvvalue1
value_lshrvalue2ret1ret2rv1rv2rv1_64rv2_64rv_shls                               r   ptx_shfl_sync_i32r      s     '+#D$ue!JU&&&rz*2E'F'FGG%E>D?
bjnnbjmm<==Z^^RZ^^RZ^^Z^^RZ^^= D
 )$e<<Db  ll4$eUE!BCC&&&&sA..B((a00DR\^^44B/"dDDCubjnn55\\%)=)=eh)K)KLL
z2:b>>::||D4vue"DEE||D4vue"DEE##D!,,##D!,,$$T1--c2:b>>22c2:b>>22VW%9%9%(B%G%GHH[[((&&R]__55B+Gb$Z@@Jr    c                 x   d}|j         }t          j        t          j        t          j        d          t          j        d          f          t          j        d          t          j        d          t          j        d          f          }t          j        |||          }|                    ||          S )Nzllvm.nvvm.vote.syncr   r   )rH   r   r|   r   r   r
   r~   rI   )r&   r   r'   r(   r   rL   r   r   s           r   ptx_vote_syncr     s    !E>D?2/B13A1@ A AJrNNBJrNNBJqMMJL LD )$e<<D<<d###r    c                    |\  }}|j         d         j        }|j         d         t          j        v r(|                    |t          j        |                    }d                    |          }|j        }t          j	        t          j        d          t          j        d          t          j        |          f          }	t          j        ||	|          }
|                    |
||f          S )Nr   zllvm.nvvm.match.any.sync.i{}r   )r(   r   r	   r   r   r   r   rW   rH   r|   r
   r~   rI   r&   r   r'   r(   r   r   widthr   rL   r   r   s              r   ptx_match_any_syncr     s    
 KD%HQK E
x{e'''rz%'8'899*11%88E>D?2:b>>BJrNNBJu<M<M+NOOD)$e<<D<<tUm,,,r    c                 ,   |\  }}|j         d         j        }|j         d         t          j        v r(|                    |t          j        |                    }d                    |          }|j        }t          j	        t          j
        t          j        d          t          j        d          f          t          j        d          t          j        |          f          }	t          j        ||	|          }
|                    |
||f          S )Nr   zllvm.nvvm.match.all.sync.i{}r   )r(   r   r	   r   r   r   r   rW   rH   r|   r   r
   r~   rI   r   s              r   ptx_match_all_syncr     s    
 KD%HQK E
x{e'''rz%'8'899*11%88E>D?2/B13A1@ A AJrNNBJu,=,=>@ @D )$e<<D<<tUm,,,r    c                     t          j        t          j        t          j        d          g           ddd          }|                    |g           S )Nr   zactivemask.b32 $0;=rTside_effectr   	InlineAsmr|   r   rI   r&   r   r'   r(   
activemasks        r   ptx_activemaskr   /  sL    bobjnnbAA2DdL L LJ<<
B'''r    c                     t          j        t          j        t          j        d          g           ddd          }|                    |g           S )Nr   zmov.u32 $0, %lanemask_lt;r   Tr   r   r   s        r   ptx_lanemask_ltr   6  sL    bobjnnbAA94*.0 0 0J <<
B'''r    c                 8    |                     |d                   S r:   )ctpopr%   s       r   ptx_popcr   >  s    ==a!!!r    c                      |j         | S N)fmar%   s       r   ptx_fmar   C  s    7;r    c                 n    ddd}	 ||          S # t           $ r d|  d}t          j        |          w xY w)N)f32f)f64d)r   r   z$Conversion between float16 and float unsupportedKeyErrorr   CudaLoweringErrorr   typemapmsgs      r   float16_float_ty_constraintr   H  s[    \22G,x   , , ,KXKKK&s+++,s    %4c                 4   |j         |j         k    r|S t          |j                   \  }}t          j        |                     |          t          j        d          g          }t          j        |d| dd| d          }|                    ||g          S )N   zcvt..f16 $0, $1;=,h)r   r   r   r|   get_value_typer   r   rI   	r&   r   fromtytotyvalty
constraintr   asms	            r   float16_to_float_castr   R  s    $-''
0??NB
?711$77"*R..9IJJD
,t4B4446H*6H6H6H
I
IC<<cU###r    c                 2   |j         |j         k    r|S t          |j                   \  }}t          j        t          j        d          |                     |          g          }t          j        |d| dd|           }|                    ||g          S )Nr   cvt.rn.f16. $0, $1;=h,)r   r   r   r|   r   r   r   rI   r   s	            r   float_to_float16_castr   ^  s    $-''
0AANB
?2:b>>G,B,B6,J,J+KLLD
,t727779Kz9K9K
L
LC<<cU###r    c                 r    ddddd}	 ||          S # t           $ r d|  d}t          j        |          w xY w)Nchrl)   r   r   r   z"Conversion between float16 and intr   r   r   s      r   float16_int_constraintr   j  s_    CSc33G,x   , , ,I8III&s+++,s    %6c                 (   |j         }t          |          }|j        rdnd}t          j        |                     |          t          j        d          g          }t          j        |d| | dd| d          }	|                    |	|g          S )Nrn   ur   zcvt.rni.r   r   r   )	r   r   signedr   r|   r   r   r   rI   
r&   r   r   r   r   r   r   
signednessr   r   s
             r   float16_to_integer_castr   t  s    }H'11J,J?711$77"*R..9IJJD
,tD*DhDDD):)))+ +C <<cU###r    c                 &   |j         }t          |          }|j        rdnd}t          j        t          j        d          |                     |          g          }t          j        |d| | dd|           }	|                    |	|g          S )Nrn   r   r   r   r   r   )	r   r   r   r   r|   r   r   r   rI   r   s
             r   integer_to_float16_castr    s     H'11J.3J?2:b>>#226::;= =D
,tCZCCCC)Z))+ +C <<cU###r    c                 h    t          | t          j        t          j                  fd            }d S )Nc                     t          j        t          j        d          t          j        d          t          j        d          g          }t          j        | dd          }|                    ||          S )Nr   z.f16 $0,$1,$2;=h,h,hr   r|   r   r   rI   )r&   r   r'   r(   r   r   ops         r   ptx_fp16_binaryz*lower_fp16_binary.<locals>.ptx_fp16_binary  sh    rz"~~ "
2
2?A Al4B!6!6!6AA||C&&&r    lowerr	   float16)fnr  r  s    ` r   lower_fp16_binaryr    sA    
2u}em,,' ' ' ' -,' ' 'r    addsubmulc                     t          j        t          j        d          t          j        d          g          }t          j        |dd          }|                    ||          S )Nr   zneg.f16 $0, $1;=h,hr  r&   r   r'   r(   r   r   s         r   ptx_fp16_hnegr    N    ?2:b>>BJrNN+;<<D
,t.
7
7C<<T"""r    c                 &    t          | |||          S r   )r  r%   s       r   operator_hnegr        '3555r    c                     t          j        t          j        d          t          j        d          g          }t          j        |dd          }|                    ||          S )Nr   zabs.f16 $0, $1;r  r  r  s         r   ptx_fp16_habsr    r  r    c                 &    t          | |||          S r   )r  r%   s       r   operator_habsr    r  r    c                    t          j        d          t          j        d          t          j        d          g}t          j        t          j        d          |          }t          j        |dd          }|                    ||          S )Nr   zfma.rn.f16 $0,$1,$2,$3;z=h,h,h,h)r   r   r|   r   rI   )r&   r   r'   r(   argtysr   r   s          r   ptx_hfmar    sg    jnnbjnnbjnn=F?2:b>>622D
,t6

C
CC<<T"""r    c                 8    d }|                      ||||          S )Nc                 B    t           j                            | |          S r   )r   fp16hdiv)r   r   s     r   fp16_divzfp16_div_impl.<locals>.fp16_div  s    y~~a###r    compile_internal)r&   r   r'   r(   r#  s        r   fp16_div_implr&    s-    $ $ $ ##GXsDAAAr    z{{
          .reg .pred __$$f16_cmp_tmp;
          setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
          selp.u16 $0, 1, 0, __$$f16_cmp_tmp;
        }}c                       fd}|S )Nc                    t          j        t          j        d          t          j        d          t          j        d          g          }t          j        |t                              	          d          }|                    ||          }|                     t          j	        d          }|
                    |t          j        d                    }|                    d||          S )Nr   )r  r  r   z!=)r   r|   r   r   	_fp16_cmprW   rI   rF   r	   int16r   icmp_unsigned)
r&   r   r'   r(   r   r   resultzero
int_resultr  s
            r   ptx_fp16_comparisonz*_gen_fp16_cmp.<locals>.ptx_fp16_comparison  s    rz"~~
2
2/OPPl4!1!1R!1!8!8(CCc4((##EK33__VRZ^^<<
$$T:t<<<r    rS   )r  r/  s   ` r   _gen_fp16_cmpr0    s$    = = = = = r    eqnegegtleltc                 h    t          | t          j        t          j                  fd            }d S )Nc                      t                    | |||          }|                    ||d         |d                   S )Nr   r   )r0  select)r&   r   r'   r(   choicer  s        r   ptx_fp16_minmaxz*lower_fp16_minmax.<locals>.ptx_fp16_minmax  s>    "r""7GS$??~~fd1gtAw777r    r  )r  r   r  r;  s     ` r   lower_fp16_minmaxr<    sA    
2u}em,,8 8 8 8 -,8 8 8r    maxmin
__nv_cbrtf	__nv_cbrtc                     |j         }t          |         }|                     |          }|j        }t	          j        ||g          }t          j        |||          }	|                    |	|          S r   )	return_type
cbrt_funcsr   rH   r   r|   r
   r~   rI   )
r&   r   r'   r(   r   r   ftyrL   r   r  s
             r   ptx_cbrtrE    si     
BrNE

 
 
$
$C>D?3&&D		'dE	:	:B<<D!!!r    c           	          t          j        |j        t          j        t          j        d          t          j        d          f          d          }|                    ||          S )Nr   	__nv_brevr
   r~   rH   r   r|   r   rI   r&   r   r'   r(   r  s        r   ptx_brev_u4rJ    sV    
 
	'

2B(9::
 
B <<D!!!r    c           	          t          j        |j        t          j        t          j        d          t          j        d          f          d          }|                    ||          S )Nr   __nv_brevllrH  rI  s        r   ptx_brev_u8rM    sV    
 
	'

2B(9::
 
B <<D!!!r    c                 v    |                     |d         |                     t          j        d                    S r:   )ctlzrF   r	   booleanr%   s       r   ptx_clzrQ  '  s4    <<QU]A..0 0 0r    c           	          t          j        |j        t          j        t          j        d          t          j        d          f          d          }|                    ||          S )Nr   __nv_ffsrH  rI  s        r   
ptx_ffs_32rT  .  sV     
	'

2B(9::
 
B <<D!!!r    c           	          t          j        |j        t          j        t          j        d          t          j        d          f          d          }|                    ||          S )Nr   r   
__nv_ffsllrH  rI  s        r   
ptx_ffs_64rW  8  sV     
	'

2B(9::
 
B <<D!!!r    c                 <    |\  }}}|                     |||          S r   )r9  )r&   r   r'   r(   testabs          r   ptx_selpr\  B  s#    JD!Q>>$1%%%r    c           	          t          j        |j        t          j        t          j                    t          j                    t          j                    f          d          }|                    ||          S )N
__nv_fmaxfr
   r~   rH   r   r|   r   rI   rI  s        r   
ptx_max_f4r`  H  `    		'
LNN\^^R\^^,	. 	. 	
 
B <<D!!!r    c           
         t          j        |j        t          j        t          j                    t          j                    t          j                    f          d          }|                    ||                     ||d         |j        d         t          j
                  |                     ||d         |j        d         t          j
                  g          S )N	__nv_fmaxr   r   r
   r~   rH   r   r|   r   rI   castr(   r	   doublerI  s        r   
ptx_max_f8rg  S       
	'
MOO]__bmoo.	0 	0 	
 
B <<Wd1gsx{ELAAWd1gsx{ELAA   r    c           	          t          j        |j        t          j        t          j                    t          j                    t          j                    f          d          }|                    ||          S )N
__nv_fminfr_  rI  s        r   
ptx_min_f4rk  d  ra  r    c           
         t          j        |j        t          j        t          j                    t          j                    t          j                    f          d          }|                    ||                     ||d         |j        d         t          j
                  |                     ||d         |j        d         t          j
                  g          S )N	__nv_fminr   r   rd  rI  s        r   
ptx_min_f8rn  o  rh  r    c           	      6   t          j        |j        t          j        t          j        d          t          j                    f          d          }|                    ||                     ||d         |j	        d         t          j                  g          S )Nr   __nv_llrintr   )r
   r~   rH   r   r|   r   r   rI   re  r(   r	   rf  rI  s        r   	ptx_roundrq    s     
	'
JrNN]__	  	  	
 
B <<Wd1gsx{ELAA   r    c                 8    d }|                      ||||          S )Nc                    t          j        |           st          j        |           r| S |dk    r7|dk    rd|dz
  z  }d}nd|z  }d}| |z  |z  }t          j        |          r| S nd| z  }| |z  }t          |          }t          j        ||z
            dk    rdt          |dz            z  }|dk    r	||z  |z  }n||z  }|S )Nr      g      $@gMDg      ?g      ?g       @)mathisinfisnanroundfabs)r   ndigitspow1pow2r   r   s         r   round_ndigitsz$round_to_impl.<locals>.round_ndigits  s    :a== 	DJqMM 	Ha<<|| "-wTT!Az!}}  WH%DDA!HHIa!e##eAGnn$Aa<<TT!AAIAr    r$  )r&   r   r'   r(   r}  s        r   round_to_implr~    s.      B ##G]CHHHr    c                       fd}|S )Nc                 z    |j         \  }|                     |          }|                    ||d                   S r:   )r(   rF   fmul)r&   r   r'   r(   argtyfactorconsts         r   implzgen_deg_rad.<locals>.impl  s9    %%eU33||FDG,,,r    rS   )r  r  s   ` r   gen_deg_radr    s#    - - - - - Kr    g     f@c                     |t           j        v rt          j        |d          }|g}n$t          j        |t          |                    } fdt          ||          D             }|j        }||k    rt          d|d|          |j	        t          |          k    r&t          d|j	        t          |          fz            ||fS )z4
    Convert integer indices into tuple of intp
    r   )r_   count)r  c                 Z    g | ]'\  }}                     ||t          j                  (S rS   )re  r	   intp)rm   tir   r&   s      r   ro   z&_normalize_indices.<locals>.<listcomp>  sA     0 0 01a ||GQ5:66 0 0 0r    zexpect z	 but got z#indexing %d-D array with %d-D index)
r	   integer_domainUniTupler
   unpack_tuplelenzipr_   	TypeErrorndim)r&   r   indtyindsarytyvaltyindicesr_   s   ``      r   _normalize_indicesr    s     $$$U!444&&wCJJGGG0 0 0 0 0ug..0 0 0G KE~~i%%%?@@@zSZZ=SZZ01 2 2 	2 '>r    c                       fd}|S )Nc                     |j         \  }}}|\  }}}	|j        }
t          | |||||          \  }} |                     |          | ||          }t	          j        | ||||d          } | ||
||	          S )NT
wraparound)r(   r_   r  
make_arrayr
   get_item_pointer)r&   r   r'   r(   r  r  r  aryr  r   r_   r  laryptrdispatch_fns                 r   impz_atomic_dispatcher.<locals>.imp  s    !hueT3+GWeT,15: :w )w!!%(('3??&wg268 8 8 {7GUC===r    rS   )r  r  s   ` r   _atomic_dispatcherr    s#    > > > > > Jr    c                 6   |t           j        k    r1|j        }|                    t	          j        |          ||f          S |t           j        k    r1|j        }|                    t	          j        |          ||f          S |                    d||d          S )Nr  	monotonic)	r	   r   rH   rI   r   declare_atomic_add_float32r   declare_atomic_add_float64
atomic_rmwr&   r   r_   r  r   rL   s         r   ptx_atomic_add_tupler        
 ~||I@FF #J( ( 	(	%-		~||I@FF #J( ( 	( !!%c;???r    c                 6   |t           j        k    r1|j        }|                    t	          j        |          ||f          S |t           j        k    r1|j        }|                    t	          j        |          ||f          S |                    d||d          S )Nr  r  )	r	   r   rH   rI   r   declare_atomic_sub_float32r   declare_atomic_sub_float64r  r  s         r   ptx_atomic_subr    r  r    c                     |t           j        j        v rG|j        }|j        }t          t          d|           }|                     ||          ||f          S t          d| d          )Ndeclare_atomic_inc_intzUnimplemented atomic inc with  array	r   cudadeclunsigned_int_numba_typesr   rH   getattrr   rI   r  r&   r   r_   r  r   bwrL   r  s           r   ptx_atomic_incr    t    
 666^~Y = = =>>||BBtHHsCj111FFFFGGGr    c                     |t           j        j        v rG|j        }|j        }t          t          d|           }|                     ||          ||f          S t          d| d          )Ndeclare_atomic_dec_intzUnimplemented atomic dec with r  r  r  s           r   ptx_atomic_decr  "  r  r    c                     t           fd            }t          j        t          j        t          j        fD ]1} t          | t          j        |t          j                  |           2d S )Nc                     |t           j        j        v r|                    ||d          S t	          d d| d          )Nr  zUnimplemented atomic z with r  r   r  integer_numba_typesr  r  )r&   r   r_   r  r   r  s        r   impl_ptx_atomicz+ptx_atomic_bitwise.<locals>.impl_ptx_atomic1  sO    T]677%%b#sK@@@KBKKeKKKLLLr    )r  r	   r  r  Tupler	  ArrayAny)stubr  r  r   s    `  r   ptx_atomic_bitwiser  0  s}    M M M M M z5>5;7 A A/dEKUY//@@@@A Ar    andorxorc                 ~    |t           j        j        v r|                    d||d          S t	          d| d          )Nxchgr  zUnimplemented atomic exch with r  r  )r&   r   r_   r  r   s        r   ptx_atomic_exchr  A  sH    
 233!!&#sK@@@G%GGGHHHr    c                    |j         }|t          j        k    r*|                    t	          j        |          ||f          S |t          j        k    r*|                    t	          j        |          ||f          S |t          j        t          j	        fv r|
                    d||d          S |t          j        t          j        fv r|
                    d||d          S t          d|z            Nr=  r  orderingumaxz&Unimplemented atomic max with %s array)rH   r	   r   rI   r   declare_atomic_max_float64r   declare_atomic_max_float32rG   int64r  uint32uint64r  r  s         r   ptx_atomic_maxr  L      
 >D||I@FF #J( ( 	(	%-		||I@FF #J( ( 	(	5;,	,	,!!%cK!HHH	5<.	.	.!!&#s[!III@5HIIIr    c                    |j         }|t          j        k    r*|                    t	          j        |          ||f          S |t          j        k    r*|                    t	          j        |          ||f          S |t          j        t          j	        fv r|
                    d||d          S |t          j        t          j        fv r|
                    d||d          S t          d|z            Nr>  r  r  uminz&Unimplemented atomic min with %s array)rH   r	   r   rI   r   declare_atomic_min_float64r   declare_atomic_min_float32rG   r  r  r  r  r  r  s         r   ptx_atomic_minr  `  r  r    c                    |j         }|t          j        k    r*|                    t	          j        |          ||f          S |t          j        k    r*|                    t	          j        |          ||f          S |t          j        t          j	        fv r|
                    d||d          S |t          j        t          j        fv r|
                    d||d          S t          d|z            r  )rH   r	   r   rI   r   declare_atomic_nanmax_float64r   declare_atomic_nanmax_float32rG   r  r  r  r  r  r  s         r   ptx_atomic_nanmaxr  t      
 >D||ICDII #J( ( 	(	%-		||ICDII #J( ( 	(	5;,	,	,!!%cK!HHH	5<.	.	.!!&#s[!III@5HIIIr    c                    |j         }|t          j        k    r*|                    t	          j        |          ||f          S |t          j        k    r*|                    t	          j        |          ||f          S |t          j        t          j	        fv r|
                    d||d          S |t          j        t          j        fv r|
                    d||d          S t          d|z            r  )rH   r	   r   rI   r   declare_atomic_nanmin_float64r   declare_atomic_nanmin_float32rG   r  r  r  r  r  r  s         r   ptx_atomic_nanminr    r  r    c                    |                     |j        d         t          j        |j        d         |j        d                   }|d         |                     t          j        d          |d         |d         f}t          | |||          S )Nr   r   rC   )rB  r(   r	   r  rF   ptx_atomic_casr%   s       r   ptx_atomic_compare_and_swapr    sm    
//#(1+uz38A;
L
LCGW))%*a88$q'47KD'7C666r    c                    |j         \  }}}}|\  }}	}
}t          | |||	||          \  }} |                     |          | ||          }t          j        | ||||d          }|j        t          j        j        v r,|j	        }|j        j
        }t          j        |||||
|          S t          d|j        z            )NTr  z&Unimplemented atomic cas with %s array)r(   r  r  r
   r  r_   r   r  r  rH   r   r   atomic_cmpxchgr  )r&   r   r'   r(   r  r  oldtyr  r  r  oldr   r  r  r  rL   r   s                    r   r  r    s     "%E5%CsC'%u(-/ /NE7 %7e$$Wgs;;D

"7GUD'.24 4 4C {t}899~;''xc3OOO@5;NOOOr    c                     t          j        t          j        t          j                    t          j        d          g          ddd          }|d         }|                    ||g           d S )Nr   znanosleep.u32 $0;r   Tr   r   )r   r   r|   r}   r   rI   )r&   r   r'   r(   	nanosleepnss         r   ptx_nanosleepr    sf    R_R[]]RZ^^<LMM0#4I I II	aBLLRD!!!!!r    Fc           
          t          t          j        |d          }|dk    o|ot          |          dk    }|dk    r|st	          d           j        |         }	t          |t          j        t          j	        f          p)t          |	t          j                  p|t          j        k    }
|t          j        vr|
st          d|z                                 |          }t!          j        ||          }|t$          j        k    rt)          j        |||          }n|j        }t)          j        ||||          }                     |          }d|dz
                                  z  |_        |rd|_        n$t!          j        |t           j                  |_        |                    |t!          j         t!          j!        d                    d          }tE          j#        t%          j$                    j%                  }                     |          }|&                    |          }|}g }tO          tQ          |                    D ]\  }}|)                    |           ||z  } d	 tQ          |          D             } fd
|D             }|rt!          j*        t!          j+        t!          j!        d          g           ddd          }|,                    |-                    |g           t!          j!        d                    } .                    t          j/        |          }|0                    ||          g}n fd|D             }t          |          }t          j1        ||d          }  2                    |           |          } 3                    ||4                    ||j5        j6                  || .                    t          j/        |          d            |7                                S )Nr   r   zarray length <= 0zunsupported type: %srX   externalr   genericc                     g | ]}|S rS   rS   rl   s     r   ro   z"_generic_array.<locals>.<listcomp>  s    ---Qq---r    c                 P    g | ]"}                     t          j        |          #S rS   rF   r	   r  rm   rn   r&   s     r   ro   z"_generic_array.<locals>.<listcomp>  s+    EEE$$UZ33EEEr    r   zmov.u32 $0, %dynamic_smem_size;r   Tr   r   c                 P    g | ]"}                     t          j        |          #S rS   r  r  s     r   ro   z"_generic_array.<locals>.<listcomp>  s+    EEE!'&&uz155EEEr    C)r_   r  layout)datar^   stridesitemsizememinfo)8r   operatorr  r  
ValueErrordata_model_manager
isinstancer	   RecordBooleanr   StructModelr
  number_domainr  get_data_typer   	ArrayTyper   ru   r
   alloca_oncerH   add_global_variableget_abi_sizeof
bit_lengthalignlinkageConstant	UndefinedinitializeraddrspacecastPointerTyper   llcreate_target_dataNVVMdata_layoutget_abi_size	enumeratereversedappendr   r|   r   rI   rF   r  udivr  r  populate_arrayr   r   type	_getvalue) r&   r   r^   r_   r`   ra   rb   	elemcountdynamic_smem
data_modelother_supported_typelldtypelarytydataptrrL   gvmemr  
targetdatar  
laststriderstridesr  lastsizer  kstridesget_dynshared_sizedynsmem_size	kitemsizekshaper  r  r  s    `                               r   rd   rd     s   x|UA..I >FlFs5zzQLA~~l~,--- +E2J55<788 	"j&"455	"EM! 
 E'''0D'.6777##E**G\'9--FD((( %gvKHHH~ +D&+,57 7 &&w// EAI22444 		B&EMM !#FBL A AE ''r~bjmm/L/L(13 3 &ty{{'>??J##E**G##J//H JH %11  8
###h

--(8,,---GEEEEWEEEH  F
  \"/"*R.."*M*M*K*.DB B B ||GLL1CR$H$H$&JrNN4 4 ((X>>	,,|Y778EEEEuEEE u::DKe$s;;;E
#'

U
#
#GW
5
5C3 ' G G!'#+$+$8$8X$N$N#'  ) ) ) ==??r    c                 *    |                                  S r   )r   )r&   r   r   pyvals       r   cuda_dispatcher_constr8  *  s    ""$$$r    )F)	functoolsr   r  ru  llvmliter   llvmlite.bindingbindingr  numba.core.imputilsr   r   numba.core.typing.npydeclr   numba.core.datamodelr   
numba.corer	   r
   numba.npr   numba.np.npyimplr   cudadrvr   numbar   
numba.cudar   r   r   numba.cuda.typesr   r   r   registryr	  lower_getattr
lower_attrlower_constantr   Moduler)   r-   r1   r5   r8   r=   rA   rD   cg	this_gridrM   rQ   r  
array_liker  rT   rV   rZ   sharedarrayIntegerLiteralr  rh   r  r  rr   localrv   ry   threadfence_blockr   threadfence_systemr   threadfencer   syncwarpr   i4r   shfl_sync_intrinsicr   f4f8r   vote_sync_intrinsicrP  r   match_any_syncr   match_all_syncr   r   r   lanemask_ltr   popcr   r   r   r   r
  Floatr   r   r   Integerr   r  r  r!  haddr  iaddhsubr  isubhmulr  imulhnegr  negr  habsr  absr  hfmar  truedivitruedivr&  r)  r0  heqr1  hner2  hger3  hgtr4  hler5  hltr6  r<  hmaxhminr   r   rC  cbrtrE  brevu4rJ  u8rM  clzrQ  ffsrT  rW  selpr\  r=  r`  rg  r>  rk  rn  rx  rq  r~  r  pi_deg2rad_rad2degradiansdegreesr  r  atomicr  r  r  incr  decr  r  and_r   r  exchr  r  r  nanmaxr  nanminr  compare_and_swapr  casr  r  r  r  rd   r8  
get_ufuncsrS   r    r   <module>r     s                       4 4 4 4 4 4 4 4 1 1 1 1 1 1 ' ' ' ' ' ' % % % % % % % %       , , , , , ,             / / / / / / / / / / = = = = = = = = = =8::#
(3 3 3 LEL,,+ + -,+ LEL
++, , ,+, LEL
++- - ,+- LEL	**. . +*. LEL))2 2 *)2 D#* * * D#* * * D#* * * tw   $$  %$ tzek**  +* 3 3 3 t{%.	::- - ;:- t{%+uy11t{%.%)44- - 54 21- tz-uy99. . :9. tzei00tz33. . 43 10. u% %  % u  % % ! % u% % % u~A A A u~ux  % % ! % u %(EHehx u %(EHehx u %(EHehx u %(EHehx + +    +\ u %(EHemDD$ $ ED$ uUXux00uUXux00uUXux00uUXux00	- 	- 10 10 10 10	- uUXux00uUXux00uUXux00uUXux00- - 10 10 10 10- u( ( ( u( ( ( uz59" " " uy%)UY	22  32, , , EM5;''$ $ ('$ EK''$ $ ('$, , , EM5=))	$ 	$ *)	$ EM5=))E %-00
$ 
$ 10 *)
$' ' '  %*/5 ) ) )  (, & & &  (- ' ' '  %*/5 ) ) )  (, & & &  (- ' ' '  %*/5 ) ) )  (, & & &  (- ' ' ' uz&&# # '&# x|U]##6 6 $#6 uz&&# # '&# sEM6 6 6 uzu}emDD# # ED# x66x%-77B B 87 76B		 	 	 4ejnemU] 3 3MM$4G4G H H H 0hk5=%- 0 0t1D1D E E E 3ejnemU] 3 3MM$4G4G H H H 0hk5=%- 0 0t1D1D E E E 3ejnemU] 3 3MM$4G4G H H H 0hk5=%- 0 0t1D1D E E E 3ejnemU] 3 3MM$4G4G H H H 0hk5=%- 0 0t1D1D E E E 3ejnemU] 3 3MM$4G4G H H H 0hk5=%- 0 0t1D1D E E E 3ejnemU] 3 3MM$4G4G H H H 0hk5=%- 0 0t1D1D E E E8 8 8  %*/5$ / / /  %*/5$ / / / 
M<	M;
 uz5=!!uz5=!!" " "! "!" uz58" " " uz58" " " uy%)0 0 0 uy%(uy%(" "  " uy%(uy%(" "  " uz59ei33& & 43&
 sEHeh" "  " sEHehsEHehsEHeh        sEHeh" "  " sEHehsEHehsEHeh        uehueh	 	  	  ueh&&ueh&&"I "I '& '&"IJ   7T>$'> dlEH  kk(33 4 4 4 dlEH  kk(33 4 4 4 dlEH  kk(33 4 4 4 dlEH  kk(33 4 4 4  .  $ u|ej%)<<u|enei@@u|ek59==
@ 
@  >= A@ =<
@ u|ej%)<<u|enei@@u|ek59==
@ 
@  >= A@ =<
@ u|ej%)<<u|enei@@u|ek59==H H  >= A@ =<H u|ej%)<<u|enei@@u|ek59==H H  >= A@ =<H	A 	A 	A  5<$e , , ,  5<#T * * *  5<#U + + + u|%+uz59==u|%+u~uyAAu|%+u{EI>>I I  ?> BA >=I u|ej%)<<u|ek59==u|enei@@J J  A@ >= =<J  u|ej%)<<u|ek59==u|enei@@J J  A@ >= =<J  u|EKUY??u|EKei@@u|EKCCJ J  DC A@ @?J  u|EKUY??u|EKei@@u|EKCCJ J  DC A@ @?J  u|$ek59eiHH7 7 IH7 u|ej%)UYGGu|ek59eiHHu|eneiKKP P LK IH HGP* u%%" " &%" !&a a a aH % %  % ##%%u - - - - -r    