
    cW                        d Z ddlmZ ddlZddlZddlmZ ddlZddl	Z
ddlmZ ddlmZmZ ddlmZ ddlmZ  ej        e          Zd	 Zd9dZdej        dddfdZd Zd:dZd Zd;dZd;dZd;dZ  G d d          Z!d Z"d;dZ#e#Z$d;dZ%dej&        fdZ' G d d          Z( G d d          Z)d  Z*d! Z+d<d#Z, ed$ ej-        g e.%                    Z/ ed& ej-        g e.%                    Z0d=d(Z1d) Z2d* Z3d>d+Z4d>d,Z5d>d-Z6d. Z7d/ Z8d0 Z9	 dd1l:m;Z;m<Z<m=Z= n# e>$ r d2 Z;d3 Z<d4 Z=Y nw xY wd5 Z? G d6 d7          Z@	 dd8lAmBZB dS # e>$ r ejC        w xY w)?zMath helper functions.    )with_statementN)utils)entropy)get_blas_funcstriu)get_lapack_funcs)psic                 2    t          | f|f          d         S )a  Helper for getting the appropriate BLAS function, using :func:`scipy.linalg.get_blas_funcs`.

    Parameters
    ----------
    name : str
        Name(s) of BLAS functions, without the type prefix.
    ndarray : numpy.ndarray
        Arrays can be given to determine optimal prefix of BLAS routines.

    Returns
    -------
    object
        BLAS function for the needed operation on the given data type.

    r   )r   )namendarrays     /lib/python3.11/site-packages/gensim/matutils.pyblasr      s      4'G:..q11    Fc                 |   t          j        |           } || j        }|dk    rg S |r|  } || j        k    st          t           d          st          j        |           d|         S t          j        | |          d|         }|                    t          j        |                     |                              S )a  Efficiently calculate indices of the `topn` smallest elements in array `x`.

    Parameters
    ----------
    x : array_like
        Array to get the smallest element indices from.
    topn : int, optional
        Number of indices of the smallest (greatest) elements to be returned.
        If not given, indices of all elements will be returned in ascending (descending) order.
    reverse : bool, optional
        Return the `topn` greatest elements in descending order,
        instead of smallest elements in ascending order?

    Returns
    -------
    numpy.ndarray
        Array of `topn` indices that sort the array in the requested order.

    Nr   argpartition)npasarraysizehasattrargsortr   take)xtopnreversemost_extremes       r   r   r   /   s    ( 	
1A vqy 	 Bqv~ $WR88 $z!}}UdU##?1d++ETE2LRZ|(<(<==>>>r   c                    	 || j         }|| j        }|| j        }n# t          $ r Y nw xY w|rt                              d           |||ddg}}t          j        |ft          j                  }t          j        |f|          }	t          |           D ]q\  }
}|r%|
|z  dk    rt                              d|
|           |t          |          z   }|r	t          | ng g f\  |||<   |	||<   |                    |           |}r||k    s
J d            t          j                            |	||f||f|          }n%dg g dgf\  }}	}}t          |           D ]\  }
}|r$|
|z  dk    rt                              d|
           |r	t          | ng g f\  }}|                    |           |	                    |           |t          |          z  }|                    |           ||rt#          |          d	z   nd}t          |          d	z
  }t          j        |	|          }	t          j        |          }t          j                            |	||f||f|          }|S )
a*  Convert a streamed corpus in bag-of-words format into a sparse matrix `scipy.sparse.csc_matrix`,
    with documents as columns.

    Notes
    -----
    If the number of terms, documents and non-zero elements is known, you can pass
    them here as parameters and a (much) more memory efficient code path will be taken.

    Parameters
    ----------
    corpus : iterable of iterable of (int, number)
        Input corpus in BoW format
    num_terms : int, optional
        Number of terms in `corpus`. If provided, the `corpus.num_terms` attribute (if any) will be ignored.
    dtype : data-type, optional
        Data type of output CSC matrix.
    num_docs : int, optional
        Number of documents in `corpus`. If provided, the `corpus.num_docs` attribute (in any) will be ignored.
    num_nnz : int, optional
        Number of non-zero elements in `corpus`. If provided, the `corpus.num_nnz` attribute (if any) will be ignored.
    printprogress : int, optional
        Log a progress message at INFO level once every `printprogress` documents. 0 to turn off progress logging.

    Returns
    -------
    scipy.sparse.csc_matrix
        `corpus` converted into a sparse CSC matrix.

    See Also
    --------
    :class:`~gensim.matutils.Sparse2Corpus`
        Convert sparse format to Gensim corpus format.

    Nz"creating sparse matrix from corpusr   dtypezPROGRESS: at document #%i/%iz:mismatch between supplied and computed number of non-zeros)shaper   zPROGRESS: at document #%i   )	num_termsnum_docsnum_nnzAttributeErrorloggerinfor   emptyint32	enumeratelenzipappendscipysparse
csc_matrixextendmaxr   )corpusr!   r   r"   r#   printprogressposnowindptrindicesdatadocnodocposnextresultdoc_indicesdoc_datas                   r   
corpus2cscr>   Q   s    F
  	)(I 	'H 	%nG    :8999 "l "l' "lQC(G:RX666x
%000#F++ 	 	JE3 M!6!!; M:E8LLLs3xx'GKN>\c3iiUWY[T\;GFGO$d67?&;MM'"""FF ^^"^^^^(($)@T\H]ej(kk *+BQC&w#F++ 		# 		#JE3 @!6!!; @7??? 25$BCII2r(!KNN;'''KK!!!s3xxGMM'"""" 	;,3:Gq((Iv;;?z$e,,,*W%%(($)@T\H]ej(kkMs    
++c                     |dk     rd}|dk     rd}| j         \  }}t          j        | t          j        ||f          gt          j        |||z   f          gg          S )az  Add additional rows/columns to `mat`. The new rows/columns will be initialized with zeros.

    Parameters
    ----------
    mat : numpy.ndarray
        Input 2D matrix
    padrow : int
        Number of additional rows
    padcol : int
        Number of additional columns

    Returns
    -------
    numpy.matrixlib.defmatrix.matrix
        Matrix with needed padding.

    r   )r   r   blockzeros)matpadrowpadcolrowscolss        r   padrG      sy    $ z z JD$8	bhf~&&'	64&=)	*	*+   r   C   c                 F   t          j        | t           j                  t          j        |          j        z  }t          j        ||z   t           j                  }|j        j         |z  }||||z            	                    |          
                    | |          S )a  Get array aligned at `align` byte boundary in memory.

    Parameters
    ----------
    shape : int or (int, int)
        Shape of array.
    dtype : data-type
        Data type of array.
    order : {'C', 'F'}, optional
        Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory.
    align : int, optional
        Boundary for alignment in bytes.

    Returns
    -------
    numpy.ndarray
        Aligned array.

    r   )order)r   prodint64r   itemsizerA   uint8ctypesr7   viewreshape)r   r   rK   alignnbytesbufferstart_indexs          r   zeros_alignedrW      s    ( WU"(+++bhuoo.FFFXfunBH555F=%%-K+{V33499%@@HHV[H\\\r   c                     t          | t          j                  r| j        dk    pt          j                            |           S )zCheck whether `m` is a 2D `numpy.ndarray` or `scipy.sparse` matrix.

    Parameters
    ----------
    m : object
        Object to check.

    Returns
    -------
    bool
        Is `m` a 2D `numpy.ndarray` or `scipy.sparse` matrix.

       )
isinstancer   r   ndimr-   r.   issparse)ms    r   ismatrixr^      s6     a$$41P8M8Ma8P8PPr   &.>c                     t          | t          j                  rt          |           S t          j                            |           rt          |           S fd| D             S )a  Convert a numpy.ndarray or `scipy.sparse` vector into the Gensim bag-of-words format.

    Parameters
    ----------
    vec : {`numpy.ndarray`, `scipy.sparse`}
        Input vector
    eps : float, optional
        Value used for threshold, all coordinates less than `eps` will not be presented in result.

    Returns
    -------
    list of (int, float)
        Vector in BoW format.

    c                     g | ];\  }}t          j        |          k    t          |          t          |          f<S  r   absintfloat).0fidfwepss      r   
<listcomp>zany2sparse.<locals>.<listcomp>  s@    IIIgc2r

S8HISXXuRyy!IIIr   )rZ   r   r   	dense2vecr-   r.   r\   scipy2sparsevecrj   s    `r   
any2sparserp      sk      #rz"" #c"""|S!! &C%%%IIIIIIIIr   c           	      &   t           j                            |           st          d| z            |dk    rt           j                            g           S | j        d         dk    rt          t          | j                  |d          }| j	        
                    |          | j        
                    |          }}t           j                            ||dt          |          gf          S g }g }dg}t          |           }	t          | j        d                   D ]}
|                     |
          }|	                    |
          }t          |j        |d          }|j	        
                    |          |j        
                    |          }}|                    |           |                    |           |                    |d         t          t          |          |          z              t!          j        |                                          }t!          j        |                                          }t           j        j                            |||f| j        d         t!          j        |          dz   f          S )a  Get the 'topn' elements of the greatest magnitude (absolute value) from a `scipy.sparse` vector or matrix.

    Parameters
    ----------
    matrix : `scipy.sparse`
        Input vector or matrix (1D or 2D sparse array).
    topn : int
        Number of greatest elements, in absolute value, to return.
    eps : float
        Ignored.

    Returns
    -------
    `scipy.sparse.csr.csr_matrix`
        Clipped matrix.

    z"'%s' is not a scipy sparse vector.r   r    Tr   )r   )r-   r.   r\   
ValueError
csr_matrixr   r   rd   r7   r6   r   r*   rangegetrowr,   minr   concatenateravelcsrr1   )matrixr   rj   biggestr6   r7   matrix_indicesmatrix_datamatrix_indptr
matrix_absivv_abss                r   scipy2scipy_clippedr     sI   $ <  (( H=FGGGqy +|&&r***|A! 
#fk**D$???++G44fk6F6Fw6O6O|&&g3w<<7H'IJJJ [[
v|A'' 		N 		NAa  A%%a((Eej$===GINN733QV[[5I5ITGt$$$!!'***  r!2SWt5L5L!LMMMM77==??n[117799|**.-8<?BF>$:$:Q$>? + 
 
 	
r   c                     |                                  } | j        d         dk    sJ fdt          | j        | j                  D             S )au  Convert a scipy.sparse vector into the Gensim bag-of-words format.

    Parameters
    ----------
    vec : `scipy.sparse`
        Sparse vector.

    eps : float, optional
        Value used for threshold, all coordinates less than `eps` will not be presented in result.

    Returns
    -------
    list of (int, float)
        Vector in Gensim bag-of-words format.

    r   r    c                     g | ];\  }}t          j        |          k    t          |          t          |          f<S rb   rc   )rg   posvalrj   s      r   rk   z scipy2sparse.<locals>.<listcomp>O  sF    cccxsCQSQWX[Q\Q\_bQbcSXXuSzz"cccr   )tocsrr   r+   r6   r7   rn   s    `r   rm   rm   <  sQ    " ))++C9Q<1cccc3s{CH3M3Mccccr   c                   $    e Zd ZdZd Zd Zd ZdS )Scipy2CorpuszConvert a sequence of dense/sparse vectors into a streamed Gensim corpus object.

    See Also
    --------
    :func:`~gensim.matutils.corpus2csc`
        Convert corpus in Gensim format to `scipy.sparse.csc` matrix.

    c                     || _         dS )z

        Parameters
        ----------
        vecs : iterable of {`numpy.ndarray`, `scipy.sparse`}
            Input vectors.

        N)vecs)selfr   s     r   __init__zScipy2Corpus.__init__[  s     			r   c              #      K   | j         D ]?}t          |t          j                  rt	          |          V  .t          |          V  @d S N)r   rZ   r   r   full2sparserm   )r   ro   s     r   __iter__zScipy2Corpus.__iter__f  sc      9 	( 	(C#rz** (!#&&&&&&"3''''''		( 	(r   c                 *    t          | j                  S r   )r*   r   r   s    r   __len__zScipy2Corpus.__len__m  s    49~~r   N__name__
__module____qualname____doc__r   r   r   rb   r   r   r   r   R  sK         	 	 	( ( (    r   r   c                     t          j        |t           j                  }d | D             } t          |           } t	          |                                           |t	          |           <   |S )a_  Convert a document in Gensim bag-of-words format into a dense numpy array.

    Parameters
    ----------
    doc : list of (int, number)
        Document in BoW format.
    length : int
        Vector dimensionality. This cannot be inferred from the BoW, and you must supply it explicitly.
        This is typically the vocabulary size or number of topics, depending on how you created `doc`.

    Returns
    -------
    numpy.ndarray
        Dense numpy vector for `doc`.

    See Also
    --------
    :func:`~gensim.matutils.full2sparse`
        Convert dense array to gensim bag-of-words format.

    r   c              3   X   K   | ]%\  }}t          |          t          |          fV  &d S r   )re   rf   )rg   id_val_s      r   	<genexpr>zsparse2full.<locals>.<genexpr>  s7      
:
:{TCHHeDkk"
:
:
:
:
:
:r   )r   rA   float32dictlistvalues)r9   lengthr;   s      r   sparse2fullr   q  s]    , XfBJ///F
:
:c
:
:
:C
s))CSZZ\\**F499Mr   c                     t          j        | t                    } t          j        t	          |           |k              d         }t          t          ||                     |                              S )aO  Convert a dense numpy array into the Gensim bag-of-words format.

    Parameters
    ----------
    vec : numpy.ndarray
        Dense input vector.
    eps : float
        Feature weight threshold value. Features with `abs(weight) < eps` are considered sparse and
        won't be included in the BOW result.

    Returns
    -------
    list of (int, float)
        BoW format of `vec`, with near-zero values omitted (sparse vector).

    See Also
    --------
    :func:`~gensim.matutils.sparse2full`
        Convert a document in Gensim bag-of-words format into a dense numpy array.

    r   r   )r   r   rf   nonzerord   r   r+   r   )ro   rj   nnzs      r   r   r     sX    , *S
&
&
&C
*SXX^
$
$Q
'CC#''(((r   c                    |dk    rg S t          j        | t                    } t          j        t	          |           |k              d         }|                    t          t	          |                               |          |d                    }t          t          ||                     |                              S )a  Like :func:`~gensim.matutils.full2sparse`, but only return the `topn` elements of the greatest magnitude (abs).

    This is more efficient that sorting a vector and then taking the greatest values, especially
    where `len(vec) >> topn`.

    Parameters
    ----------
    vec : numpy.ndarray
        Input dense vector
    topn : int
        Number of greatest (abs) elements that will be presented in result.
    eps : float
        Threshold value, if coordinate in `vec` < eps, this will not be presented in result.

    Returns
    -------
    list of (int, float)
        Clipped vector in BoW format.

    See Also
    --------
    :func:`~gensim.matutils.full2sparse`
        Convert dense array to gensim bag-of-words format.

    r   r   Trr   )	r   r   rf   r   rd   r   r   r   r+   )ro   r   rj   r   r}   s        r   full2sparse_clippedr     s    8 qy 	
*S
&
&
&C
*SXX^
$
$Q
'Chhws3xx}}S114FFFGGGGSXXg..//000r   c                    |Rdt          j        |f|          }}t          |           D ]\  }}t          |          |dd|f<   |dz   |k    sJ n t          j        fd| D                       }|                    |          S )a  Convert corpus into a dense numpy 2D array, with documents as columns.

    Parameters
    ----------
    corpus : iterable of iterable of (int, number)
        Input corpus in the Gensim bag-of-words format.
    num_terms : int
        Number of terms in the dictionary. X-axis of the resulting matrix.
    num_docs : int, optional
        Number of documents in the corpus. If provided, a slightly more memory-efficient code path is taken.
        Y-axis of the resulting matrix.
    dtype : data-type, optional
        Data type of the output matrix.

    Returns
    -------
    numpy.ndarray
        Dense 2D array that presents `corpus`.

    See Also
    --------
    :class:`~gensim.matutils.Dense2Corpus`
        Convert dense matrix to Gensim corpus format.

    Nrs   r   r    c                 0    g | ]}t          |          S rb   )r   )rg   r9   r!   s     r   rk   z corpus2dense.<locals>.<listcomp>  s#    !P!P!P#+c9"="=!P!P!Pr   )r   r'   r)   r   column_stackastype)r2   r!   r"   r   r8   r;   r9   s    `     r   corpus2denser     s    4  RBHi%:%HHHv#F++ 	; 	;JE3*3	::F111e8qyH$$$$$ !P!P!P!P!P!P!PQQ==r   c                   &    e Zd ZdZddZd Zd ZdS )Dense2Corpusa  Treat dense numpy array as a streamed Gensim corpus in the bag-of-words format.

    Notes
    -----
    No data copy is made (changes to the underlying matrix imply changes in the streamed corpus).

    See Also
    --------
    :func:`~gensim.matutils.corpus2dense`
        Convert Gensim corpus to dense matrix.
    :class:`~gensim.matutils.Sparse2Corpus`
        Convert sparse matrix to Gensim corpus format.

    Tc                 4    |r|j         | _        dS || _        dS )z

        Parameters
        ----------
        dense : numpy.ndarray
            Corpus in dense format.
        documents_columns : bool, optional
            Documents in `dense` represented as columns, as opposed to rows?

        N)Tdense)r   r   documents_columnss      r   r   zDense2Corpus.__init__  s$      	DJJJDJJJr   c              #   J   K   | j         D ]}t          |j                  V  dS )zIterate over the corpus.

        Yields
        ------
        list of (int, float)
            Document in BoW format.

        N)r   r   flat)r   r9   s     r   r   zDense2Corpus.__iter__  s<       : 	( 	(Cch''''''	( 	(r   c                 *    t          | j                  S r   )r*   r   r   s    r   r   zDense2Corpus.__len__)  s    4:r   NTr   rb   r   r   r   r     sP             
( 
( 
(    r   r   c                   ,    e Zd ZdZddZd Zd Zd ZdS )	Sparse2Corpusa,  Convert a matrix in scipy.sparse format into a streaming Gensim corpus.

    See Also
    --------
    :func:`~gensim.matutils.corpus2csc`
        Convert gensim corpus format to `scipy.sparse.csc` matrix
    :class:`~gensim.matutils.Dense2Corpus`
        Convert dense matrix to gensim corpus.

    Tc                 |    |r|                                 | _        dS |                                j        | _        dS )z

        Parameters
        ----------
        sparse : `scipy.sparse`
            Corpus scipy sparse format
        documents_columns : bool, optional
            Documents will be column?

        N)tocscr.   r   r   )r   r.   r   s      r   r   zSparse2Corpus.__init__8  s5      	+ ,,..DKKK ,,..*DKKKr   c           	   #      K   t          | j        j        | j        j        dd                   D ]H\  }}t          t          | j        j        ||         | j        j        ||                             V  IdS )zj

        Yields
        ------
        list of (int, float)
            Document in BoW format.

        r    N)r+   r.   r5   r   r6   r7   )r   indprevindnows      r   r   zSparse2Corpus.__iter__H  s        #4;#5t{7I!""7MNN 	c 	cOGVs4;.wv~>@PQXY_Q_@`aabbbbbb	c 	cr   c                 &    | j         j        d         S )Nr    )r.   r   r   s    r   r   zSparse2Corpus.__len__T  s    { ##r   c                 j   | j         }t          |t                    r^| j         j        |         }| j         j        |dz            }t	          t          |j        ||         |j        ||                             S | j                             t          ddd          |f          }t          |          S )a  
        Retrieve a document vector or subset from the corpus by key.

        Parameters
        ----------
        key: int, ellipsis, slice, iterable object
            Index of the document retrieve.
            Less commonly, the key can also be a slice, ellipsis, or an iterable
            to retrieve multiple documents.

        Returns
        -------
        list of (int, number), Sparse2Corpus
            Document in BoW format when `key` is an integer. Otherwise :class:`~gensim.matutils.Sparse2Corpus`.
        r    N)r.   rZ   re   r5   r   r+   r6   r7   __getitem__slicer   )r   keyr.   iprevinows        r   r   zSparse2Corpus.__getitem__W  s      c3 	RK&s+E;%cAg.DFN5:6E$J8OPPQQQ((%dD*A*A3)GHHV$$$r   Nr   )r   r   r   r   r   r   r   r   rb   r   r   r   r   -  sb        	 	+ + + + 
c 
c 
c$ $ $% % % % %r   r   c                     t          |           dk    rdS dt          j        t          d | D                                 z  }|dk    s
J d            |S )zCalculate L2 (euclidean) length of a vector.

    Parameters
    ----------
    vec : list of (int, number)
        Input vector in sparse bag-of-words format.

    Returns
    -------
    float
        Length of `vec`.

    r                 ?c              3   &   K   | ]\  }}|d z  V  dS rY   Nrb   rg   _r   s      r   r   zveclen.<locals>.<genexpr>  s*       : :FAsa : : : : : :r   ;sparse documents must not contain any explicit zero entries)r*   mathsqrtsumro   r   s     r   veclenr   q  sd     3xx1} s49S : :c : : :::;;;FC<VVVVVVMr   c                 J    dk    rfd| D             S t          |           S )a#  Normalize a vector in L2 (Euclidean unit norm).

    Parameters
    ----------
    vec : list of (int, number)
        Input vector in BoW format.
    length : float
        Length of vector

    Returns
    -------
    list of (int, number)
        L2-normalized vector in BoW format.

    r   c                 $    g | ]\  }}||z  fS rb   rb   )rg   termidr   r   s      r   rk   z&ret_normalized_vec.<locals>.<listcomp>  s&    >>>;63v&>>>r   )r   r   s    `r   ret_normalized_vecr     s7      } >>>>#>>>>Cyyr   r    c                 ,   d}t          | j                  dk    rt          j        |           }|t          j        t          |           dz             z
  |z
  }t          j        t          j        | |z                       }t          j        |          |z
  }| |z  } n|dk    rt          j        | d          }|t          j        | j        d         dz             z
  |z
  }t          j        t          j        | |d d t          j        f         z             d          }t          j        |          |z
  }| |d d t          j        f         z
  } nA|dk    r)t          | j	                  }|d         j	        |d         fS t          d|z            | |fS )N      Y@r    r   r   z'%s' is not a supported axis)r*   r   r   r1   logr   expnewaxisret_log_normalize_vecr   rt   )ro   axislog_maxmax_val	log_shifttotlog_normks           r   r   r     sq   G
39~~ D&++bfSXX^444w>	fRVC)O,,--6#;;*x19 
	DfS!nnG"&1);"<"<<wFI&i2:&> >??CCCvc{{Y.HBJ//CCQY 	D%ce,,AQ461Q4<;dBCCC=r   nrm2r   scall2c                    d}||vrt          d|d|d          t          j                            |           r|                                 } |dk    r+t          j        t          j        | j                            }|dk    r.t          j	        t          j        | j        dz                      }|dk    r| j
        }|d	k    rKt          j        | j        t
          j                  r|                     t                    } | |z  } |r| |fS | S |r| d
fS | S t!          | t
          j                  r|dk    r&t          j        t          j        |                     }|dk    r| j        dk    rd	}nt'          |           }|dk    rt          j        |           }|d	k    rt          j        | j        t
          j                  r|                     t                    } |r-t+          d
|z  |                               | j                  |fS t+          d
|z  |                               | j                  S |r| d
fS | S 	 t-          t/          |                     }n# t0          $ r |r| d
fcY S | cY S w xY wt!          |t2          t4          f          rt7          |          dk    r|dk    r&t          t          d | D                                 }|dk    r.d
t9          j	        t          d | D                                 z  }|dk    rd
t7          |           z  }|d	k    s
J d            |rt;          | |          |fS t;          | |          S t          d          )au  Scale a vector to unit length.

    Parameters
    ----------
    vec : {numpy.ndarray, scipy.sparse, list of (int, float)}
        Input vector in any format
    norm : {'l1', 'l2', 'unique'}, optional
        Metric to normalize in.
    return_norm : bool, optional
        Return the length of vector `vec`, in addition to the normalized vector itself?

    Returns
    -------
    numpy.ndarray, scipy.sparse, list of (int, float)}
        Normalized vector in same format as `vec`.
    float
        Length of `vec` before normalization, if `return_norm` is set.

    Notes
    -----
    Zero-vector will be unchanged.

    )l1r   unique'z9' is not a supported norm. Currently supported norms are .r   r   rY   r   r   r   r   c              3   :   K   | ]\  }}t          |          V  d S r   rd   r   s      r   r   zunitvec.<locals>.<genexpr>
  s,      ::FAss3xx::::::r   c              3   &   K   | ]\  }}|d z  V  dS r   rb   r   s      r   r   zunitvec.<locals>.<genexpr>  s*      (D(Dfa(D(D(D(D(D(Dr   r   zunknown input type)rt   r-   r.   r\   r   r   r   rd   r7   r   r   
issubdtyper   integerr   rf   rZ   r   r   	blas_nrm2count_nonzero	blas_scalnextiterStopIterationtupler   r*   r   r   )ro   normreturn_normsupported_normsr   firstr   s          r   unitvecr     s   0 -O?" vj^b^b^bdsdsdstuuu|S!! iikk4< 	.VBF38,,--F4< 	4WRVCHM2233F8 	WFC< 	}SY
33 (jj''6MC F{"
 Cx
#rz"" 4< 	)VBF3KK((F4< 	(x1} ("38 	+%c**FC< 	}SY
33 (jj'' F vs33::39EEvMM vs33::39EEE Cx
T#YY    	8OOOJJJ	 %%'' /CJJ!O /4< 	<3::c:::::;;F4< 	F49S(D(D(D(D(D%D%DEEEF8 	$3s88^F|ZZZZZZ 	3%c622F::%c6222-...s   I# #I:5I:9I:c                 2   t          |           t                    c} | rsdS dt          j        t          d |                                 D                                 z  }dt          j        t          d                                 D                                 z  }|dk    r|dk    s
J d            t                    t          |           k     r| c} t          fd|                                 D                       }|||z  z  }|S )a  Get cosine similarity between two sparse vectors.

    Cosine similarity is a number between `<-1.0, 1.0>`, higher means more similar.

    Parameters
    ----------
    vec1 : list of (int, float)
        Vector in BoW format.
    vec2 : list of (int, float)
        Vector in BoW format.

    Returns
    -------
    float
        Cosine similarity between `vec1` and `vec2`.

    r   r   c              3       K   | ]	}||z  V  
d S r   rb   rg   r   s     r   r   zcossim.<locals>.<genexpr>-  &      !E!E#)!E!E!E!E!E!Er   c              3       K   | ]	}||z  V  
d S r   rb   r  s     r   r   zcossim.<locals>.<genexpr>.  r  r   r   c              3   P   K   | ] \  }}|                     |d           z  V  !dS )r   N)get)rg   indexvaluevec2s      r   r   zcossim.<locals>.<genexpr>2  s:      OO,%%---OOOOOOr   )r   r   r   r   r   r*   items)vec1r
  vec1lenvec2lenr;   s    `   r   cossimr    s   $ dT$ZZJD$ t sDIc!E!Et{{}}!E!E!EEEFFFGDIc!E!Et{{}}!E!E!EEEFFFGS=iWs]ii,iiii
4yy3t99  4
dOOOO$**,,OOOOOF
gFMr   c                 .   t           j                            |           r&|                                                                 } 	 | d         \  }}t          |          t          |          f n$# t          $ r Y dS t          t          f$ r Y dS w xY wdS )zChecks if a vector is in the sparse Gensim bag-of-words format.

    Parameters
    ----------
    vec : object
        Object to check.

    Returns
    -------
    bool
        Is `vec` in BoW format.

    r   TF)
r-   r.   r\   todensetolistre   rf   
IndexErrorrt   	TypeError)ro   r   r   s      r   isbowr  7  s     |S!! %kkmm""$$F	TC%++   tt	"   uu4s   )A1 1
B>BBc                 f   t           j                            |           r|                                 } t           j                            |          r|                                }t	          |           rt	          |          rt|$t          | |          }t          ||          }||fS t          t          |           t          |                    }t          | |          }t          ||          }||fS t          |           dk    r| d         } t          |          dk    r|d         }| |fS )Nr    r   )r-   r.   r\   toarrayr  r   r1   r*   )r  r
  num_featuresdense1dense2max_lens         r   _convert_vecr  Q  s   |T"" ||~~|T"" ||~~T{{ uT{{  	" |44F |44F6>!#d))SYY//G w//F w//F6>! t99> 	7Dt99> 	7DTzr   c                 L    t          | ||          \  } }t          | |          S )uQ  Calculate Kullback-Leibler distance between two probability distributions using `scipy.stats.entropy`.

    Parameters
    ----------
    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    num_features : int, optional
        Number of features in the vectors.

    Returns
    -------
    float
        Kullback-Leibler distance between `vec1` and `vec2`.
        Value in range [0, +∞) where values closer to 0 mean less distance (higher similarity).

    r  r  r   )r  r
  r  s      r   kullback_leiblerr   j  s,    & dD|DDDJD$4r   c                     t          | ||          \  } }d| |z   z  }dt          | |          t          ||          z   z  S )aZ  Calculate Jensen-Shannon distance between two probability distributions using `scipy.stats.entropy`.

    Parameters
    ----------
    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    num_features : int, optional
        Number of features in the vectors.

    Returns
    -------
    float
        Jensen-Shannon distance between `vec1` and `vec2`.

    Notes
    -----
    This is a symmetric and finite "version" of :func:`gensim.matutils.kullback_leibler`.

    r        ?r  )r  r
  r  avg_vecs       r   jensen_shannonr$    sN    , dD|DDDJD$TD[!G'$((74+A+AABBr   c                     t           j                                       r                                  t           j                                      r                                t	                     rt	                    rt                     t                    c t          t                                                     t                                                    z             }t          j
        dt           fd|D                       z            }|S t          j
        dt          j
                   t          j
                  z
  dz                                  z            }|S )a  Calculate Hellinger distance between two probability distributions.

    Parameters
    ----------
    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.

    Returns
    -------
    float
        Hellinger distance between `vec1` and `vec2`.
        Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity).

    r"  c              3      K   | ]X}t          j                            |d                     t          j                            |d                     z
  dz  V  YdS )r   rY   N)r   r   r  )rg   r  r  r
  s     r   r   zhellinger.<locals>.<genexpr>  s_      nn]brwtxxs3344rwtxxs?S?S7T7TTWXXnnnnnnr   rY   )r-   r.   r\   r  r  r   setr   keysr   r   r   )r  r
  r6   sims   ``  r   	hellingerr*    s4   " |T"" ||~~|T"" ||~~T{{ 
uT{{ 
$ZZd
dd499;;''$tyy{{*;*;;<<g#nnnnnfmnnnnnn
 
 
gcbgdmmbgdmm;a?DDFFFGG
r   c           	         t           j                            |           r|                                 } t           j                            |          r|                                }t	          |           rt	          |          rt          d | D                       t          d |D                       z   }t          |           t          |          }} d}|                                 D ],\  }}|t          ||	                    |d                    z  }-dt          |          t          |          z  z
  S t          | t          j                  r|                                 } t          |t          j                  r|                                }t          |           } t          |          }| |z  }| |z  }dt          t!          |                    t          t!          |                    z  z
  S )a  Calculate Jaccard distance between two vectors.

    Parameters
    ----------
    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.

    Returns
    -------
    float
        Jaccard distance between `vec1` and `vec2`.
        Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity).

    c              3       K   | ]	\  }}|V  
d S r   rb   rg   r   weights      r   r   zjaccard.<locals>.<genexpr>  s&      33{sFF333333r   c              3       K   | ]	\  }}|V  
d S r   rb   r-  s      r   r   zjaccard.<locals>.<genexpr>  s&      9Y9Y[S&&9Y9Y9Y9Y9Y9Yr   r   r    )r-   r.   r\   r  r  r   r   r  rx   r  rf   rZ   r   r   r  r'  r*   )r  r
  unionintersection
feature_idfeature_weights         r   jaccardr4    s   & |T"" ||~~|T"" ||~~T{{ @uT{{ @ 33d33333c9Y9YTX9Y9Y9Y6Y6YY$ZZdd*.**,, 	K 	K&JCS0I0IJJJLL5&&u555 dBJ'' 	!;;==DdBJ'' 	!;;==D4yy4yyd{t5\**++eCJJ.?.????r   c                     t          | |z            }|dk    rdS dt          t          | |z                      t          |          z  z
  S )a]  Calculate Jaccard distance between two sets.

    Parameters
    ----------
    set1 : set
        Input set.
    set2 : set
        Input set.

    Returns
    -------
    float
        Jaccard distance between `set1` and `set2`.
        Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity).
    r   r   )r*   rf   )set1set2union_cardinalitys      r   jaccard_distancer9    sU    " D4K((A rc$+&&''%0A*B*BBBBr   )	logsumexpmean_absolute_differencedirichlet_expectationc                     t          j        |           }t          j        t          j        t          j        | |z
                                } | |z  } | S )a  Log of sum of exponentials.

        Parameters
        ----------
        x : numpy.ndarray
            Input 2d matrix.

        Returns
        -------
        float
            log of sum of exponentials of elements in `x`.

        Warnings
        --------
        For performance reasons, doesn't support NaNs or 1d, 3d, etc arrays like :func:`scipy.special.logsumexp`.

        )r   r1   r   r   r   )r   x_maxs     r   r:  r:  	  sF    $ q		F26"&U++,,--	U
r   c                 T    t          j        t          j        | |z
                      S )a  Mean absolute difference between two arrays.

        Parameters
        ----------
        a : numpy.ndarray
            Input 1d array.
        b : numpy.ndarray
            Input 1d array.

        Returns
        -------
        float
            mean(abs(a - b)).

        )r   meanrd   )abs     r   r;  r;  !  s       wrva!e}}%%%r   c                 Z   t          | j                  dk    r2t          |           t          t          j        |                     z
  }nFt          |           t          t          j        | d                    ddt          j        f         z
  }|                    | j        d          S )a  Expected value of log(theta) where theta is drawn from a Dirichlet distribution.

        Parameters
        ----------
        alpha : numpy.ndarray
            Dirichlet parameter 2d matrix or 1d vector, if 2d - each row is treated as a separate parameter vector.

        Returns
        -------
        numpy.ndarray
            Log of expected values, dimension same as `alpha.ndim`.

        r    NF)copy)r*   r   r	   r   r   r   r   r   )alphar;   s     r   r<  r<  3  s     u{q  	GZZ#bfUmm"4"44FFZZ#bfUA&6&6"7"72:"FFF}}U[u}555r   c                 \   t          j        | d                   }| d= ~ |j        \  }}t                              dt          |j                             t          d|f          \  } ||dd          \  }}}} |||d         d          \  }}}}~|dk    sJ t          |d|d|f                   }	||k     r|ddd|f         }t          d|f          \  }
 |
||dd          \  }}} |
|||d         d          \  }}}|dk    s
J d	            |j        j	        sJ ||	fS )
a  Get QR decomposition of `la[0]`.

    Parameters
    ----------
    la : list of numpy.ndarray
        Run QR decomposition on the first elements of `la`. Must not be empty.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Matrices :math:`Q` and :math:`R`.

    Notes
    -----
    Using this function is less memory intense than calling `scipy.linalg.qr(la[0])`,
    because the memory used in `la[0]` is reclaimed earlier. This makes a difference when
    decomposing very large arrays, where every memory copy counts.

    Warnings
    --------
    Content of `la` as well as `la[0]` gets destroyed in the process. Again, for memory-effiency reasons.

    r   zcomputing QR of %s dense matrix)geqrfrs   T)lworkoverwrite_aN)orgqrz	qr failed)
r   asfortranarrayr   r%   debugstrr   r   flagsf_contiguous)larA  r]   nrG  qrtauworkr&   rgorgqrqs               r   
qr_destroyrX  H  st   0 	"Q%  A
1r7DAq
LL2CLLAAAj1$//FE%>>>BT4%adCCCBT4	19RBQBZA1u 2A2YzB511GFF2s"$???MAtTF2s$q'tDDDMAtT19!!k!!!7a4Kr   c                   R    e Zd ZdZdZd Zd Zd Zd Ze	dd
            Z
d Zd Zd	S )MmWritera  Store a corpus in `Matrix Market format <https://math.nist.gov/MatrixMarket/formats.html>`_,
    using :class:`~gensim.corpora.mmcorpus.MmCorpus`.

    Notes
    -----
    The output is written one document at a time, not the whole matrix at once (unlike e.g. `scipy.io.mmread`).
    This allows you to write corpora which are larger than the available RAM.

    The output file is created in a single pass through the input corpus, so that the input can be
    a once-only stream (generator).

    To achieve this, a fake MM header is written first, corpus statistics are collected
    during the pass (shape of the matrix, number of non-zeroes), followed by a seek back to the beginning of the file,
    rewriting the fake header with the final values.

    s.   %%MatrixMarket matrix coordinate real general
c                     || _         |                    d          s|                    d          rt          d          t          j        | j         d          | _        d| _        dS )zf

        Parameters
        ----------
        fname : str
            Path to output file.

        z.gzz.bz2z-compressed output not supported with MmWriterzwb+FN)fnameendswithNotImplementedErrorr   openfoutheaders_written)r   r\  s     r   r   zMmWriter.__init__  sf     
>>%   	WENN6$:$: 	W%&UVVVJtz511	$r   c           
         | j                             t          j                   |dk     rMt                              d| j                   | j                             t          j        d                     nXt                              d|||| j                   | j                             t          j        |d|d|d                     d| _	        d| _
        d	S )
a  Write headers to file.

        Parameters
        ----------
        num_docs : int
            Number of documents in corpus.
        num_terms : int
            Number of term in corpus.
        num_nnz : int
            Number of non-zero elements in corpus.

        r   zsaving sparse matrix to %sz3                                                  
z9saving sparse %sx%s matrix with %i non-zero entries to %s 
rs   TN)r`  writerZ  HEADER_LINEr%   r&   r\  r   to_utf8
last_docnora  )r   r"   r!   r#   s       r   write_headerszMmWriter.write_headers  s     		,---Q; 		ZKK4djAAAIOOEM/::;;;;KKK)Wdj   IOOEM(((IIIwww*WXXYYY#r   c                    d|||fz  }t          |          dk    rt          d          | j                            t          t          j                             | j                            t          j        |                     dS )aM  Write "fake" headers to file, to be rewritten once we've scanned the entire corpus.

        Parameters
        ----------
        num_docs : int
            Number of documents in corpus.
        num_terms : int
            Number of term in corpus.
        num_nnz : int
            Number of non-zero elements in corpus.

        z%i %i %i2   z Invalid stats: matrix too large!N)	r*   rt   r`  seekrZ  rf  re  r   rg  )r   r"   r!   r#   statss        r   fake_headerszMmWriter.fake_headers  sz     h	7;;u::? 	A?@@@	s8/00111	e,,-----r   c           	      d   | j         s
J d            | j        |k     sJ d| j        |fz              t          d |D                       }|D ]=\  }}| j                            t          j        d|dz   |dz   |fz                       >|| _        |r|d         d         t          |          fndS )	ai  Write a single sparse vector to the file.

        Parameters
        ----------
        docno : int
            Number of document.
        vector : list of (int, number)
            Document in BoW format.

        Returns
        -------
        (int, int)
            Max word index in vector and len of vector. If vector is empty, return (-1, 0).

        z:must write Matrix Market file headers before writing data!z,documents %i and %i not in sequential order!c              3   J   K   | ]\  }}t          |          d k    ||fV  dS )g-q=Nr   )rg   r   ws      r   r   z(MmWriter.write_vector.<locals>.<genexpr>  s9      DD41aSVVe^DADDDDDDr   z	%i %i %s
r    rs   r   )rs   r   )ra  rh  sortedr`  re  r   rg  r*   )r   r8   vectorr   r.  s        r   write_vectorzMmWriter.write_vector  s      #aa%aaaa&qq(VZ^ZikpYq(qqqqDD6DDDDD$ 	[ 	[NFFIOOEM,%!)VaZQW9X*XYYZZZZ/5Br
1s6{{++7Br     FNc           	      :   t          |           }|                    ddd           d\  }}d\  }	}
g }t          |d          r|j        }||_        |ri }nd}t	          |          D ]\  }	}|r|\  }}|||	<   n|}|	|z  dk    rt
                              d|	           |r;|j                                        }||
k    rd|d<   |	                    |           |}
|
                    |	|          \  }}t          |d|z             }||z  }|rt          j        || d	z              ||_        |	dz   }|p|}||z  dk    r+t
                              d
||d|z  ||z  z  |||z             |                    |||           |                                 |r|S dS )aS  Save the corpus to disk in `Matrix Market format <https://math.nist.gov/MatrixMarket/formats.html>`_.

        Parameters
        ----------
        fname : str
            Filename of the resulting file.
        corpus : iterable of list of (int, number)
            Corpus in streamed bag-of-words format.
        progress_cnt : int, optional
            Print progress for every `progress_cnt` number of documents.
        index : bool, optional
            Return offsets?
        num_terms : int, optional
            Number of terms in the corpus. If provided, the `corpus.num_terms` attribute (if any) will be ignored.
        metadata : bool, optional
            Generate a metadata file?

        Returns
        -------
        offsets : {list of int, None}
            List of offsets (if index=True) or nothing.

        Notes
        -----
        Documents are processed one at a time, so the whole corpus is allowed to be larger than the available RAM.

        See Also
        --------
        :func:`gensim.corpora.mmcorpus.MmCorpus.save_corpus`
            Save corpus to disk.

        rs   )r   r   )rs   rs   metadataFr   zPROGRESS: saving document #%ir    z.metadata.cpicklez*saved %ix%i matrix, density=%.3f%% (%i/%i)r   N)rZ  ri  r   rw  r)   r%   r&   r`  tellr,   rt  r1   r   picklern  close)r\  r2   progress_cntr  r!   rw  mw
_num_termsr#   r8   poslastoffsetsorig_metadatadocno2metadatar9   bowr7   r4   max_idr   r"   s                        r   write_corpuszMmWriter.write_corpus  s   D e__ 	R$$$ #
Gw6:&& 	"OM&FO $!#H#F++ 	 	JE3 	T(,u%%|#q( D;UCCC !W$ %"$GBKv&&& __UC88NFFZV44JvGG 	,L1D)DEEE+FO19+	i1$ 	KK<)UW_98L%MwX`clXl   	)W555



 	N	 	r   c                 .    |                                   dS )zClose `self.fout` file. Alias for :meth:`~gensim.matutils.MmWriter.close`.

        Warnings
        --------
        Closing the file explicitly via the close() method is preferred and safer.

        N)rz  r   s    r   __del__zMmWriter.__del__6  s     	

r   c                     t                               d| j                   t          | d          r| j                                         dS dS )zClose `self.fout` file.z
closing %sr`  N)r%   rL  r\  r   r`  rz  r   s    r   rz  zMmWriter.close@  sL    \4:...4   	IOO	 	r   )ru  FNF)r   r   r   r   rf  r   ri  rn  rt  staticmethodr  r  rz  rb   r   r   rZ  rZ  u  s           EK% % %$ $ $8. . .&C C C2 T T T \Tl      r   rZ  )MmReader)NF)rH   rI   )r_   )r    )r   Fr   )Dr   
__future__r   loggingr   gensimr   numpyr   scipy.sparser-   scipy.statsr   scipy.linalgr   r   scipy.linalg.lapackr   scipy.specialr	   	getLoggerr   r%   r   r   float64r>   rG   rW   r^   rp   r   rm   r   r   r   rl   r   r   r   r   r   r   r   r   arrayrf   r   r   r   r  r  r  r   r$  r*  r4  r9  gensim._matutilsr:  r;  r<  ImportErrorrX  rZ  gensim.corpora._mmreaderr  	NO_CYTHONrb   r   r   <module>r     s     % % % % % %                       - - - - - - - - 0 0 0 0 0 0       
	8	$	$2 2 2&? ? ? ?D "&RZ$PTde S S S Sl  :] ] ] ]4Q Q Q"J J J J.3
 3
 3
 3
ld d d d,       >  @) ) ) )6 	!1 !1 !1 !1H .2 (  (  (  ( V, , , , , , , ,^A% A% A% A% A% A% A% A%H  *  ,   . D"E22233	D"E22233	^/ ^/ ^/ ^/B  >  4   2   .C C C C6  D+@ +@ +@\C C C0A6[[[[[[[[[[[ =6 =6 =6  0& & &$6 6 6 6 6W=6@* * *ZO O O O O O O Od11111111   
/s   
D   D10D1E E