
    cH                       d Z ddlmZ ddlmZ ddlZddlZddlZddl	Z	ddl
mZ ddlZddlZddlZddlZddlZddlZddlZddlmZ ddlZddlZddlZddlZddlZddlZddlmZ ddlmZ ddl Z ddl!Z!ddl"Z#ddl$Z%dd	l&m'Z' dd
l(m)Z*  ej+        e,          Z-dZ. ej/        dej0                  Z1 ej/        dej0                  Z2 e3d          Z4	 e#j        5                                Z6d Z7d Z8d Z9ed             Z:d Z;d Z<d_dZ=d Z>d`dZ?dadZ@e@ZAdbdZBeBZCd ZD G d  d!          ZEd" ZFd# ZG G d$ d%          ZHd& ZId' ZJd( ZK G d) d*eE          ZL G d+ d,eE          ZM G d- d.eE          ZN G d/ d0eE          ZOd1 ZPd2 ZQde#jR        fd3ZSeSZT G d4 d5ejU                  ZVejW        d6k    sej         d7k    rejX        d8k    rdcd9ZYndcd:ZYd; ZZe.fd<Zd= Z[d> Z\d? Z]ed@             Z^ e]dA          dddC            Z_dedEZ` e]dA          dfdG            ZadgdIZbdhdJZcdidMZddjdNZedkdOZfdkdPZgdQ ZhdR ZidZjdSZkdZldkdTZmejn        fdUZodldVZpdW ZqdmdXZrdndYZsdZ Ztd[ Zud\ Zvd] Zwd^ ZxdS )oz"Various general utility functions.    )with_statement)contextmanagerN)name2codepointwraps)deepcopy)datetime)open)__version__   z(((?![\d])\w)+)z&(#?)([xX]?)(\w{1,8});a  Compiled extensions are unavailable. If you've installed from a package, ask the package maintainer to include compiled extensions. If you're building Gensim from source yourself, install Cython and a C compiler, and then run `python setup.py build_ext --inplace` to retry. c                 >   | | t           j        u rt           j        j        j        S t	          | t
          j        t           j        f          rt           j                            |           S t	          | t           j        j                  r| S t          d| z            )a  Generate :class:`numpy.random.RandomState` based on input seed.

    Parameters
    ----------
    seed : {None, int, array_like}
        Seed for random state.

    Returns
    -------
    :class:`numpy.random.RandomState`
        Random state.

    Raises
    ------
    AttributeError
        If seed is not {None, int, array_like}.

    Notes
    -----
    Method originally from `maciejkula/glove-python <https://github.com/maciejkula/glove-python>`_
    and written by `@joshloyal <https://github.com/joshloyal>`_.

    Nz:%r cannot be used to seed a np.random.RandomState instance)
nprandommtrand_rand
isinstancenumbersIntegralintegerRandomState
ValueError)seeds    ,lib/python3.11/site-packages/gensim/utils.pyget_random_stater   A   s    0  &try( &y%%$)2:677 +y$$T***$	-.. 
QTXX
Y
YY    c                       fd}|S )zA decorator to place an instance-based lock around a method.

    Notes
    -----
    Adapted from http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/.

    c                 @     t                      fd            }|S )Nc                 >   t          |           }t                              dj                   |5  t                              dj                    | g|R i |}t                              dj                   |cd d d            S # 1 swxY w Y   d S )Nzacquiring lock %r for %szacquired lock %r for %szreleasing lock %r for %s)getattrloggerdebug__name__)selfargskwargstlockresultfunc	tlocknames        r   _synchronizerz4synchronous.<locals>._synched.<locals>._synchronizerk   s    D),,ELL3YNNN  6	4=QQQd4T444V447DMRRR	                 s   ABBBr   )r(   r*   r)   s   ` r   _synchedzsynchronous.<locals>._synchedj   s:    	t	 	 	 	 	 
	 r    )r)   r+   s   ` r   synchronousr-   b   s#         Or   c                 z    t          | t                    rt          | d          S |                     d           | S )a4  Open a filename for reading with `smart_open`, or seek to the beginning if `input` is an already open file.

    Parameters
    ----------
    input : str or file-like
        Filename or file-like object.

    Returns
    -------
    file-like object
        An open file, positioned at the beginning.

    rbr   )r   strr
   seek)inputs    r   file_or_filenamer3   y   s<     % E4    	

1r   c              #     K   t          |           }d}	 |V  nC# t          $ r6 d}t          | t                    r |j        t          j                     s Y nw xY w|s.t          | t                    r|                    ddd           dS dS dS # |s-t          | t                    r|                    ddd           w w w xY w)a  Provide "with-like" behaviour without closing the file object.

    Parameters
    ----------
    input : str or file-like
        Filename or file-like object.

    Yields
    -------
    file
        File-like object based on input (or input if this already file-like).

    FTN)r3   	Exceptionr   r0   __exit__sysexc_info)r2   mgrexcs      r   	open_filer;      s	      5
!
!C
C
+				   %%% 	\S\3<>>-J 		 	  	+z%-- 	+LLtT*****	+ 	+ 	+ 	+s 	+z%-- 	+LLtT****	+ 	+s&    B =AB AB 2Cc                     t          | t                    s|                     d          } t          j        d|           }d                    d |D                       }t          j        d|          S )u  Remove letter accents from the given string.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        Unicode string without accents.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.utils import deaccent
        >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
        u'Sef chomutovskych komunistu dostal postou bily prasek'

    utf8NFD c              3   J   K   | ]}t          j        |          d k    |V  dS )MnN)unicodedatacategory).0chs     r   	<genexpr>zdeaccent.<locals>.<genexpr>   s7      KKB+*>r*B*Bd*JKRKKKKKKr   NFC)r   r0   decoderB   	normalizejoin)textnormr'   s      r   deaccentrM      sl    , dC   #{{6"" --DWWKK$KKKKKF ///r   c                     t           j        }	 t          j        t           _        t          j        | |           |t           _        dS # |t           _        w xY w)a$  Recursively copy a directory ala shutils.copytree, but hardlink files instead of copying.

    Parameters
    ----------
    source : str
        Path to source directory
    dest : str
        Path to destination directory

    Warnings
    --------
    Available on UNIX systems only.

    N)shutilcopy2oslinkcopytree)sourcedestrP   s      r   copytree_hardlinkrV      sI     LEw%%%us   +A AFr=   strictc                     |p|p|}t          | ||          } |r|                                 } |rt          |           } t          |           S )u  Iteratively yield tokens as unicode strings, optionally removing accent marks and lowercasing it.

    Parameters
    ----------
    text : str or bytes
        Input string.
    deacc : bool, optional
        Remove accentuation using :func:`~gensim.utils.deaccent`?
    encoding : str, optional
        Encoding of input string, used as parameter for :func:`~gensim.utils.to_unicode`.
    errors : str, optional
        Error handling behaviour, used as parameter for :func:`~gensim.utils.to_unicode`.
    lowercase : bool, optional
        Lowercase the input string?
    to_lower : bool, optional
        Same as `lowercase`. Convenience alias.
    lower : bool, optional
        Same as `lowercase`. Convenience alias.

    Yields
    ------
    str
        Contiguous sequences of alphabetic characters (no digits!), using :func:`~gensim.utils.simple_tokenize`

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.utils import tokenize
        >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc=True))
        [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']

    errors)
to_unicodelowerrM   simple_tokenize)rK   	lowercasedeaccencodingrZ   to_lowerr\   s          r   tokenizerb      s^    D .X.IdHV444D zz|| ~~4   r   c              #   p   K   t                               |           D ]}|                                V  dS )zTokenize input test using :const:`gensim.utils.PAT_ALPHABETIC`.

    Parameters
    ----------
    text : str
        Input text.

    Yields
    ------
    str
        Tokens from `text`.

    N)PAT_ALPHABETICfinditergroup)rK   matchs     r   r]   r]     sF        ((..  kkmm r         c                 J    fdt          | d|d          D             }|S )aq  Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.

    Uses :func:`~gensim.utils.tokenize` internally.

    Parameters
    ----------
    doc : str
        Input document.
    deacc : bool, optional
        Remove accent marks from tokens using :func:`~gensim.utils.deaccent`?
    min_len : int, optional
        Minimum length of token (inclusive). Shorter tokens are discarded.
    max_len : int, optional
        Maximum length of token in result (inclusive). Longer tokens are discarded.

    Returns
    -------
    list of str
        Tokens extracted from `doc`.

    c                 x    g | ]6}t          |          cxk    rk    n n|                    d           4|7S )_)len
startswith)rD   tokenmax_lenmin_lens     r   
<listcomp>z%simple_preprocess.<locals>.<listcomp>6  sx       c%jj   $+    494D4DS4I4I  r   Tignore)r\   r_   rZ   )rb   )docr_   rq   rp   tokenss     `` r   simple_preprocessrv      sI    ,    #Ct5RRR  F Mr   c                     t          | t                    r|                     d          S t          | ||                              d          S )az  Convert a unicode or bytes string in the given encoding into a utf8 bytestring.

    Parameters
    ----------
    text : str
        Input text.
    errors : str, optional
        Error handling behaviour if `text` is a bytestring.
    encoding : str, optional
        Encoding of `text` if it is a bytestring.

    Returns
    -------
    str
        Bytestring in utf8.

    r=   rY   )r   r0   encode)rK   rZ   r`   s      r   any2utf8ry   =  sK    & $ #{{6"""tXf---44V<<<r   c                 T    t          | t                    r| S t          | ||          S )av  Convert `text` (bytestring in given encoding or unicode) to unicode.

    Parameters
    ----------
    text : str
        Input text.
    errors : str, optional
        Error handling behaviour if `text` is a bytestring.
    encoding : str, optional
        Encoding of `text` if it is a bytestring.

    Returns
    -------
    str
        Unicode version of `text`.

    rY   )r   r0   )rK   r`   rZ   s      r   any2unicoder{   Y  s0    $ $ tXf----r   c                       t          d          )aC  Helper to raise `AttributeError` if a class method is called on an instance. Used internally.

    Parameters
    ----------
    *args
        Variable length argument list.
    **kwargs
        Arbitrary keyword arguments.

    Raises
    ------
    AttributeError
        If a class method is called on an instance.

    z/This method should be called on a class object.)AttributeError)r$   r%   s     r   call_on_class_onlyr~   s  s      J
K
KKr   c                       e Zd ZdZej        fdZedd            Zd Z	e
d             Zdd e            efdZd	 Zdd e            efd
ZdS )SaveLoada  Serialize/deserialize objects from disk, by equipping them with the `save()` / `load()` methods.

    Warnings
    --------
    This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes
    such as lambda functions etc.

    c                    t          |          }t          j                                                    |d<   t          |d<   t
          j        |d<   t          j                    |d<   ||d<   t          | d          s,t          
                    d| j        j                   g | _        |r't                              |d| j        j        |           | j        | j                            |           d	S d	S )
a  
        Append an event into the `lifecycle_events` attribute of this object, and also
        optionally log the event at `log_level`.

        Events are important moments during the object's life, such as "model created",
        "model saved", "model loaded", etc.

        The `lifecycle_events` attribute is persisted across object's :meth:`~gensim.utils.SaveLoad.save`
        and :meth:`~gensim.utils.SaveLoad.load` operations. It has no impact on the use of the model,
        but is useful during debugging and support.

        Set `self.lifecycle_events = None` to disable this behaviour. Calls to `add_lifecycle_event()`
        will not record events into `self.lifecycle_events` then.

        Parameters
        ----------
        event_name : str
            Name of the event. Can be any label, e.g. "created", "stored" etc.
        event : dict
            Key-value mapping to append to `self.lifecycle_events`. Should be JSON-serializable, so keep it simple.
            Can be empty.

            This method will automatically add the following key-values to `event`, so you don't have to specify them:

            - `datetime`: the current date & time
            - `gensim`: the current Gensim version
            - `python`: the current Python version
            - `platform`: the current platform
            - `event`: the name of this event
        log_level : int
            Also log the complete event dict, at the specified log level. Set to False to not log at all.

        r	   gensimpythonplatformeventlifecycle_eventsz2starting a new internal lifecycle event log for %sz%s lifecycle event %sN)r   r	   now	isoformatgensim_versionr7   versionr   hasattrr    r!   	__class__r"   r   logappend)r#   
event_name	log_levelr   
event_dicts        r   add_lifecycle_eventzSaveLoad.add_lifecycle_event  s    H e__
!)!9!9!;!;
:-
8"{
8!)!2!4!4
:(
7t/00 	'LLMt~Ofggg$&D! 	`JJy"94>;RT^___  	5!((44444	5 	5r   Nc                     t                               d| j        |           t                              |          \  }}t          |          }|                    ||||           |                    d|           |S )aa  Load an object previously saved using :meth:`~gensim.utils.SaveLoad.save` from a file.

        Parameters
        ----------
        fname : str
            Path to file that contains needed object.
        mmap : str, optional
            Memory-map option.  If the object was saved with large arrays stored separately, you can load these arrays
            via mmap (shared memory) using `mmap='r'.
            If the file being loaded is compressed (either '.gz' or '.bz2'), then `mmap=None` **must be** set.

        See Also
        --------
        :meth:`~gensim.utils.SaveLoad.save`
            Save object to file.

        Returns
        -------
        object
            Object loaded from `fname`.

        Raises
        ------
        AttributeError
            When called on an object instance instead of class (this is a class method).

        zloading %s object from %sloaded)fname)r    infor"   r   _adapt_by_suffixunpickle_load_specialsr   )clsr   mmapcompresssubnameobjs         r   loadzSaveLoad.load  sx    : 	/uEEE$55e<<'uoo5$':::666
r   c           	         d }t          | dg           D ]}d                    ||f          }t                              d|||           t	                      5  t          | |                              ||||           ddd           n# 1 swxY w Y   t          | dg           D ]}t                              d| |||          |           |r=|r || |||                    t          j         |||                    d         }n t          j         |||          |	          }t	                      5  t          | ||           ddd           n# 1 swxY w Y   t          | d
g           D ]e}t                              d| |||          |           t           |||                    }	|rw|r || |||                    t          j         |||d                    5 }
|
d         |	_
        |
d         |	_        |
d         |	_        ddd           n# 1 swxY w Y   nrt          j         |||d          |	          |	_
        t          j         |||d          |	          |	_        t          j         |||d          |	          |	_        t	                      5  t          | ||	           ddd           n# 1 swxY w Y   gt          | dg           D ]T}t                              d|           t	                      5  t          | |d           ddd           n# 1 swxY w Y   UdS )uz  Load attributes that were stored separately, and give them the same opportunity
        to recursively load using the :class:`~gensim.utils.SaveLoad` interface.

        Parameters
        ----------
        fname : str
            Input file path.
        mmap :  {None, ‘r+’, ‘r’, ‘w+’, ‘c’}
            Memory-map options. See `numpy.load(mmap_mode)
            <https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.load.html>`_.
        compress : bool
            Is the input file compressed?
        subname : str
            Attribute name. Set automatically during recursive processing.

        c                 4    t          d| d|ddz             S )NzCannot mmap compressed object z	 in file z. z:Use `load(fname, mmap=None)` or uncompress files manually.)IOError)r   filenames     r   
mmap_errorz+SaveLoad._load_specials.<locals>.mmap_error  s/    7CF33QNO  r   __recursive_saveloads.z-loading %s recursively from %s.* with mmap=%sN__numpyszloading %s from %s with mmap=%sval)	mmap_mode__scipyssparsedataindptrindices
__ignoredsz$setting ignored attribute %s to None)r   rJ   r    r   ignore_deprecation_warningr   r   r   setattrr   r   r   r   )r#   r   r   r   r   r   attribcfnamer   r   fs              r   r   zSaveLoad._load_specials  s   "	 	 	 d$;R@@ 	V 	VFXXufo..FKKGQWY]^^^+-- V Vf%%44VT8WUUUV V V V V V V V V V V V V V V dJ33 	+ 	+FKK96775RXCYCY[_``` F E$*VWWUF-C-CDDDgggeV4455e<gggeV44EEE+-- + +fc***+ + + + + + + + + + + + + + + dJ33 	. 	.FKK96775RXCYCY[_```ggeV4455F \ E$*VWWUF-C-CDDDWWWUFH==>> 2!"#F)FK$%hKFM%&y\FN2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 !gggeVV&D&DPTUUU "vx(H(HTX Y Y Y!#	)J)JVZ![![![+-- . .ff---. . . . . . . . . . . . . . . dL"55 	, 	,FKK>GGG+-- , ,fd+++, , , , , , , , , , , , , , ,	, 	,sZ   'BB	B	?EE!	$E!	6(H**H.	1H.	6KK	K	L;;L?	L?	c                 t    |                      d          s|                      d          rdnd\  }|fdfS )a  Get compress setting and filename for numpy file compression.

        Parameters
        ----------
        fname : str
            Input filename.

        Returns
        -------
        (bool, function)
            First argument will be True if `fname` compressed.

        .gz.bz2)Tnpz)Fnpyc                  6    d                     | fz             S )Nr   )rJ   )r$   suffixs    r   <lambda>z+SaveLoad._adapt_by_suffix.<locals>.<lambda>>  s    sxxy0@'A'A r   )endswith)r   r   r   s     @r   r   zSaveLoad._adapt_by_suffix.  sL     -2NN5,A,AoU^^TZE[E[o==ao&AAAAAAr   i   c           
      B   t                               |          \  }}|                     |||||||          }	 t          | ||           |D ]V\  }	}
|
                                D ]<\  }}t                      5  t          |	||           ddd           n# 1 swxY w Y   =Wn^# |D ]V\  }	}
|
                                D ]<\  }}t                      5  t          |	||           ddd           n# 1 swxY w Y   =Ww xY wt                              d|           dS )a  Save the object to a file. Used internally by :meth:`gensim.utils.SaveLoad.save()`.

        Parameters
        ----------
        fname : str
            Path to file.
        separately : list, optional
            Iterable of attributes than need to store distinctly.
        sep_limit : int, optional
            Limit for separation.
        ignore : frozenset, optional
            Attributes that shouldn't be store.
        pickle_protocol : int, optional
            Protocol number for pickle.

        Notes
        -----
        If `separately` is None, automatically detect large numpy/scipy.sparse arrays in the object being stored,
        and store them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays back
        on load efficiently.

        You can also set `separately` manually, in which case it must be a list of attribute names to be stored
        in separate files. The automatic check is not performed in this case.

        protocolNzsaved %s)	r   r   _save_specialspickleitemsr   r   r    r   )r#   r   
separately	sep_limitrs   pickle_protocolr   r   restoresr   asidesr   r   s                r   _smart_savezSaveLoad._smart_save@  s   : %55e<<'&&:y&/8W
 
	249999  ( 2 2V#)<<>> 2 2KFC355 2 2VS1112 2 2 2 2 2 2 2 2 2 2 2 2 2 222x 2 2V#)<<>> 2 2KFC355 2 2VS1112 2 2 2 2 2 2 2 2 2 2 2 2 2 222 	J&&&&&sA   B& :BBB&/DC3'D3C77D:C7;Dc                 x	   i }t           j        j        t           j        j        f}	|g }| j                                        D ]p\  }
}t          |t          j                  r!|j	        |k    r|
                    |
           @t          ||	          r |j        |k    r|
                    |
           qt                      5  |t          |          z   D ]5}
t          | |
          r#t          | |
          ||
<   t!          | |
           6	 ddd           n# 1 swxY w Y   g }g }| j                                        D ]o\  }
}t          |d          rZ|
                    |
           d                    ||
f          }|                    |                    |d|||||                     p	 g g g }}}|                                D ]\  }
}t          |t          j                  r|
|vr|
                    |
           t(                              d|
 |||
                     |r3t          j         |||
          t          j        |                     t          j         |||
          t          j        |                     t          |t           j        j        t           j        j        f          r^|
|vrY|
                    |
           t(                              d|
 |||
                     |r3t          j         |||
d          |j        |j        |j                   not          j         |||
d	          |j                   t          j         |||
d
          |j                   t          j         |||
d          |j                   |j        |j        |j        }}}d\  |_        |_        |_        	 t9          | |||
          |           |||c|_        |_        |_        :# |||c|_        |_        |_        w xY wt(                              d|
           |
                    |
           || j        d<   || j        d<   || j        d<   || j        d<   n:# t:          $ r- |                                D ]\  }
}t=          | |
|            w xY w|| |fgz   S )a  Save aside any attributes that need to be handled separately, including
        by recursion any attributes that are themselves :class:`~gensim.utils.SaveLoad` instances.

        Parameters
        ----------
        fname : str
            Output filename.
        separately : list or None
            List of attributes to store separately.
        sep_limit : int
            Don't store arrays smaller than this separately. In bytes.
        ignore : iterable of str
            Attributes that shouldn't be stored at all.
        pickle_protocol : int
            Protocol number for pickle.
        compress : bool
            If True - compress output with :func:`numpy.savez_compressed`.
        subname : function
            Produced by :meth:`~gensim.utils.SaveLoad._adapt_by_suffix`

        Returns
        -------
        list of (obj, {attrib: value, ...})
            Settings that the caller should use to restore each object's attributes that were set aside
            during the default :func:`~gensim.utils.pickle`.

        Nr   r   zstoring np array '%s' to %s)r   z(storing scipy.sparse array '%s' under %sr   )r   r   r   r   r   r   )NNNr   znot storing attribute %sr   r   r   r   )scipyr   
csr_matrix
csc_matrix__dict__r   r   r   ndarraysizer   nnzr   listr   r   delattrrJ   extendr   r    r   savez_compressedascontiguousarraysaver   r   r   r   r5   r   )r#   r   r   r   rs   r   r   r   r   sparse_matricesr   r   recursive_saveloadsr   r   numpysscipysignoredsr   r   r   s                        r   r   zSaveLoad._save_specialsl  sS   8  <2EL4KL 	.J#}2244 . .c2:.. .38y3H .%%f----_55 .#'Y:N .%%f---')) 	* 	*$tF||3 * *4(( *%,T6%:%:F6ND&)))*	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* !=..00 	y 	yKFCs,-- y#**62225&/22 2 264FTcemov w wxxx0	')2rHFF%||~~ $, $,c2:.. #,63G #,MM&)))KK =vwwuV\G]G]^^^ S+GGE6,B,BH\]`HaHabbbbbv 6 68LS8Q8QRRRRel&=u|?V%WXX ,]ckq]q ,MM&)))KK JFT[T[\aciTjTjkkk 
P+#GE68<<!$#&:$'K	     vv > >IIIvx @ @#*MMMvy A A3;OOO,/Hcj#+'&D8H5CHcj#+RsGGE6$:$:_UUUU<@&'9#*ckkD&'9#*ckQQQQKK :FCCCOOF++++(.DM*%(.DM*%*2DM,'5HDM122 	 	 	%||~~ + +fc****		
 D&>***s9   A	DDD0HQ: P,Q: PAQ: :7R1c                 >   |                      dt          |          t          |          ||           	 t          j        | ||           t                              d| j        j                   dS # t          $ r | 	                    |||||           Y dS w xY w)a  Save the object to a file.

        Parameters
        ----------
        fname_or_handle : str or file-like
            Path to output file or already opened file-like object. If the object is a file handle,
            no special array handling will be performed, all attributes will be saved to the same file.
        separately : list of str or None, optional
            If None, automatically detect large numpy/scipy.sparse arrays in the object being stored, and store
            them into separate files. This prevent memory errors for large objects, and also allows
            `memory-mapping <https://en.wikipedia.org/wiki/Mmap>`_ the large arrays for efficient
            loading and sharing the large arrays in RAM between multiple processes.

            If list of str: store these attributes into separate files. The automated size check
            is not performed in this case.
        sep_limit : int, optional
            Don't store arrays smaller than this separately. In bytes.
        ignore : frozenset of str, optional
            Attributes that shouldn't be stored at all.
        pickle_protocol : int, optional
            Protocol number for pickle.

        See Also
        --------
        :meth:`~gensim.utils.SaveLoad.load`
            Load object from file.

        saving)fname_or_handler   r   rs   r   zsaved %s object)r   N)
r   r0   _pickledumpr    r   r   r"   	TypeErrorr   )r#   r   r   r   rs   r   s         r   r   zSaveLoad.save  s    @ 	  00: 	! 	
 	
 	
	nLIIIIKK)4>+BCCCCC 	n 	n 	n_j)V]lmmmmmm	ns   <A4 4$BBN)r"   
__module____qualname____doc__loggingINFOr   classmethodr   r   staticmethodr   	frozensetPICKLE_PROTOCOLr   r   r   r,   r   r   r   r     s          9@ 45 45 45 45l # # # [#JA, A, A,F B B \B& |IIKKYh*' *' *' *'Xf+ f+ f+T |IIKKYh+n +n +n +n +n +nr   r   c                     | S )zIdentity fnc, for flows that don't accept lambda (pickling etc).

    Parameters
    ----------
    p : object
        Input parameter.

    Returns
    -------
    object
        Same as `p`.

    r,   )ps    r   identityr     s	     Hr   c                 f    d}| D ]+}|r't          |t          d |D                                 },|S )a-  Get the highest feature id that appears in the corpus.

    Parameters
    ----------
    corpus : iterable of iterable of (int, numeric)
        Collection of texts in BoW format.

    Returns
    ------
    int
        Highest feature id.

    Notes
    -----
    For empty `corpus` return -1.

    c              3       K   | ]	\  }}|V  
d S r   r,   )rD   fieldidrl   s      r   rF   zget_max_id.<locals>.<genexpr>(  s&      "F"Fzw7"F"F"F"F"F"Fr   )max)corpusmaxiddocuments      r   
get_max_idr     sO    $ E H H 	Hs"F"FX"F"F"FFFGGELr   c                   D    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
dd
Zd	S )FakeDictzObjects of this class act as dictionaries that map integer->str(integer), for a specified
    range of integers <0, num_terms).

    This is meant to avoid allocating real dictionaries when `num_terms` is huge, which is a waste of memory.

    c                     || _         dS )zf

        Parameters
        ----------
        num_terms : int
            Number of terms.

        N	num_terms)r#   r   s     r   __init__zFakeDict.__init__3  s     #r   c                 0    | j         j        d| j        dS )Nz<num_terms=>)r   r"   r   r#   s    r   __str__zFakeDict.__str__>  s    %)^%<%<%<dnnnMMr   c                     d|cxk    r| j         k     rn nt          |          S t          d|d| j         d          )Nr   zinternal id out of bounds (z, expected <0..z)))r   r0   r   r#   r   s     r   __getitem__zFakeDict.__getitem__A  sf     	 	 	 	dn 	 	 	 	 	s88OjsssTXTbTbTbcdddr   c                 ,    d|cxk    o
| j         k     nc S Nr   r   r  s     r   __contains__zFakeDict.__contains__F  s&    C(((($.(((((r   c              #   ^   K   t          | j                  D ]}|t          |          fV  dS )z~Iterate over all keys and values.

        Yields
        ------
        (int, str)
            Pair of (id, token).

        N)ranger   r0   )r#   is     r   	iteritemszFakeDict.iteritemsI  s@       t~&& 	 	ASVV)OOOO	 	r   c                     | j         dz
  gS )a  Override the `dict.keys()`, which is used to determine the maximum internal id of a corpus,
        i.e. the vocabulary dimensionality.

        Returns
        -------
        list of int
            Highest id, packed in list.

        Notes
        -----
        To avoid materializing the whole `range(0, self.num_terms)`,
        this returns the highest id = `[self.num_terms - 1]` only.

           r   r  s    r   keyszFakeDict.keysU  s     "##r   c                     | j         S r   r   r  s    r   __len__zFakeDict.__len__f  s
    ~r   Nc                 N    d|cxk    r| j         k     rn nt          |          S |S r  )r   r0   )r#   r   defaults      r   getzFakeDict.geti  sH     	 	 	 	dn 	 	 	 	 	s88Or   r   )r"   r   r   r   r   r  r  r  r  r  r  r  r,   r   r   r   r   ,  s         	# 	# 	#N N Ne e e
) ) )
 
 
$ $ $"       r   r   c                 H    dt          |           z   }t          |          }|S )a  Scan corpus for all word ids that appear in it, then construct a mapping
    which maps each `word_id` -> `str(word_id)`.

    Parameters
    ----------
    corpus : iterable of iterable of (int, numeric)
        Collection of texts in BoW format.

    Returns
    ------
    id2word : :class:`~gensim.utils.FakeDict`
        "Fake" mapping which maps each `word_id` -> `str(word_id)`.

    Warnings
    --------
    This function is used whenever *words* need to be displayed (as opposed to just their ids)
    but no `word_id` -> `word` mapping was provided. The resulting mapping only covers words actually
    used in the corpus, up to the highest `word_id` found.

    r  )r   r   )r   r   id2words      r   dict_from_corpusr  o  s(    * Jv&&&Iy!!GNr   c                    	 d| j         j        v rd| fS n# t          $ r Y nw xY w	 t          | d          st          | d          r&t	          |           }t          j        |g|           } nt	          t          |                     }t          |          dk    rd| fS t	          t          |                    \  }}t          |          t          |          }}n# t          $ r d| fcY S w xY wd| fS )aS  Check whether `obj` is a corpus, by peeking at its first element. Works even on streamed generators.
    The peeked element is put back into a object returned by this function, so always use
    that returned object instead of the original `obj`.

    Parameters
    ----------
    obj : object
        An `iterable of iterable` that contains (int, numeric).

    Returns
    -------
    (bool, object)
        Pair of (is `obj` a corpus, `obj` with peeked element restored)

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.utils import is_corpus
        >>> corpus = [[(1, 1.0)], [(2, -0.3), (3, 0.12)]]
        >>> corpus_or_not, corpus = is_corpus(corpus)

    Warnings
    --------
    An "empty" corpus (empty input sequence) is ambiguous, so in this case
    the result is forcefully defined as (False, `obj`).

    CorpusTnext__next__r   F)r   r"   r5   r   r  	itertoolschainiterrm   intfloat)r   doc1id1val1s       r   	is_corpusr$    s$   :s}-- 	9	   3 	#73
#;#; 	# 99D/4&#..CCS		??Dt99> 	9 d$$	THHeDkkT   cz9s$    
""A8C =C C.-C.c                     ddl } 	 ddlm}  |            }|                      | j        | j                  }|                    |j        j        |j        j        f           |	                                \  }}n# t          $ r 	 ddl}|                    d                              d          d                                         d         dd         }t          |                    d                    d	k    rt                      n7# t          $ r* |                     |                                           }Y nw xY wY nw xY w|S )
a}  Try to obtain our external ip (from the Pyro4 nameserver's point of view)

    Returns
    -------
    str
        IP address.

    Warnings
    --------
    This tries to sidestep the issue of bogus `/etc/hosts` entries and other local misconfiguration,
    which often mess up hostname resolution.
    If all else fails, fall back to simple `socket.gethostbyname()` lookup.

    r   N)locateNSifconfig
r     r   r   )socketPyro4.namingr&  AF_INET
SOCK_DGRAMconnect_pyroUrihostportgetsocknamer5   commands	getoutputsplitrm   gethostbynamegethostname)r*  r&  nssr'   r1  r3  s          r   	get_my_ipr:    se    MMM@))))))XZZMM&.&*;<<			2;#R[%56777}} 	@ 	@ 	@	@OOO''
3399$??BHHJJ1MabbQF6<<$$%%* "kk!" 	@ 	@ 	@))&*<*<*>*>??FFF	@	@ Ms7   A2A9 9
EBD
E1D?<E>D??EEc                       e Zd ZdZd Zd ZdS )RepeatCorpusa,  Wrap a `corpus` as another corpus of length `reps`. This is achieved by repeating documents from `corpus`
    over and over again, until the requested length `len(result) == reps` is reached.
    Repetition is done on-the-fly=efficiently, via `itertools`.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.utils import RepeatCorpus
        >>>
        >>> corpus = [[(1, 2)], []]  # 2 documents
        >>> list(RepeatCorpus(corpus, 5))  # repeat 2.5 times to get 5 documents
        [[(1, 2)], [], [(1, 2)], [], [(1, 2)]]

    c                 "    || _         || _        dS )z

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        reps : int
            Number of repeats for documents from corpus.

        N)r   reps)r#   r   r>  s      r   r   zRepeatCorpus.__init__  s     			r   c                 d    t          j        t          j        | j                  | j                  S r   )r  islicecycler   r>  r  s    r   __iter__zRepeatCorpus.__iter__  s#    	 < <diHHHr   Nr"   r   r   r   r   rB  r,   r   r   r<  r<    sA           I I I I Ir   r<  c                       e Zd ZdZd Zd ZdS )RepeatCorpusNTimesaI  Wrap a `corpus` and repeat it `n` times.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.utils import RepeatCorpusNTimes
        >>>
        >>> corpus = [[(1, 0.5)], []]
        >>> list(RepeatCorpusNTimes(corpus, 3))  # repeat 3 times
        [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []]

    c                 "    || _         || _        dS )z

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        n : int
            Number of repeats for corpus.

        N)r   n)r#   r   rG  s      r   r   zRepeatCorpusNTimes.__init__  s     r   c              #   T   K   t          | j                  D ]}| j        D ]}|V  d S r   )r
  rG  r   )r#   rl   r   s      r   rB  zRepeatCorpusNTimes.__iter__!  sH      tv 	 	A K  	 	r   NrC  r,   r   r   rE  rE    s<               r   rE  c                   &    e Zd ZdZddZd Zd ZdS )ClippedCorpusz5Wrap a `corpus` and return `max_doc` element from it.Nc                 "    || _         || _        dS )a  

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        max_docs : int
            Maximum number of documents in the wrapped corpus.

        Warnings
        --------
        Any documents after `max_docs` are ignored. This effectively limits the length of the returned corpus
        to <= `max_docs`. Set `max_docs=None` for "no limit", effectively wrapping the entire input corpus.

        N)r   max_docs)r#   r   rL  s      r   r   zClippedCorpus.__init__)  s       r   c                 @    t          j        | j        | j                  S r   )r  r@  r   rL  r  s    r   rB  zClippedCorpus.__iter__<  s    T];;;r   c                 P    t          | j        t          | j                            S r   )minrL  rm   r   r  s    r   r  zClippedCorpus.__len__?  s    4=#dk"2"2333r   r   r"   r   r   r   r   rB  r  r,   r   r   rJ  rJ  '  sL        ??! ! ! !&< < <4 4 4 4 4r   rJ  c                   $    e Zd ZdZd Zd Zd ZdS )SlicedCorpusz'Wrap `corpus` and return a slice of it.c                 0    || _         || _        d| _        dS )an  

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        slice_ : slice or iterable
            Slice for `corpus`.

        Notes
        -----
        Negative slicing can only be used if the corpus is indexable, otherwise, the corpus will be iterated over.
        Slice can also be a np.ndarray to support fancy indexing.

        Calculating the size of a SlicedCorpus is expensive when using a slice as the corpus has
        to be iterated over once. Using a list or np.ndarray does not have this drawback, but consumes more memory.

        N)r   slice_length)r#   r   rT  s      r   r   zSlicedCorpus.__init__E  s    & r   c                 "    t           j        d          r@t           j        j                  dk    r# fd j        j         j                 D             S t          j         j         j        j         j        j         j        j	                  S )Nindexr   c              3   L   K   | ]}j                             |          V  d S r   )r   docbyoffset)rD   r  r#   s     r   rF   z(SlicedCorpus.__iter__.<locals>.<genexpr>^  s3      WW1DK++A..WWWWWWr   )
r   r   rm   rW  rT  r  r@  startstopstepr  s   `r   rB  zSlicedCorpus.__iter__\  s    4;(( 	XS1B-C-Ca-G 	XWWWW8I$+8VWWWWT[->@PRVR]Rbcccr   c                    | j         t          | j        t          t          j        f          rt          | j                  | _         nt          | j        t                    rO| j                            t          | j	        j
                            \  }}}||z
  }||z  ||z  dk    z   | _         nt          d | D                       | _         | j         S )Nr   c              3      K   | ]}d V  dS )r  Nr,   )rD   xs     r   rF   z'SlicedCorpus.__len__.<locals>.<genexpr>k  s"      !2!2!!2!2!2!2!2!2r   )rU  r   rT  r   r   r   rm   slicer   r   rW  sum)r#   rZ  endr\  diffs        r   r  zSlicedCorpus.__len__a  s    ; 	3$+bj'9:: 3!$+..DK// 3%)[%8%8T[=N9O9O%P%P"TU{"dldTkAo>!!2!2T!2!2!222{r   NrP  r,   r   r   rR  rR  C  sJ        11  .d d d
    r   rR  c                 v    	 t          |           S # t          $ r d| z  }|                    d          cY S w xY w)a:  Create a unicode character from its integer value. In case `unichr` fails, render the character
    as an escaped `\U<8-byte hex value of intval>` string.

    Parameters
    ----------
    intval : int
        Integer code of character

    Returns
    -------
    string
        Unicode string of character

    z\U%08xzunicode-escape)chrr   rH   )intvalr9  s     r   safe_unichrrg  p  sQ    *6{{ * * *xx()))))	*s    $88c                 >    d }t                               ||           S )u  Decode all HTML entities in text that are encoded as hex, decimal or named entities.
    Adapted from `python-twitter-ircbot/html_decode.py
    <http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py>`_.

    Parameters
    ----------
    text : str
        Input HTML.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.utils import decode_htmlentities
        >>>
        >>> u = u'E tu vivrai nel terrore - L&#x27;aldil&#xE0; (1981)'
        >>> print(decode_htmlentities(u).encode('UTF-8'))
        E tu vivrai nel terrore - L'aldilà (1981)
        >>> print(decode_htmlentities("l&#39;eau"))
        l'eau
        >>> print(decode_htmlentities("foo &lt; bar"))
        foo < bar

    c                    	 |                      d          }|                      d          dk    rk|                      d          dk    rt          t          |                    S |                      d          dv rt          t          |d                    S d S t          j        |          }|rt          |          S |                                  S # t
          $ r |                                  cY S w xY w)N   r  #rh   r?   )r_  X   )rf   rg  r  n2cpr  r5   )rg   entcps      r   substitute_entityz.decode_htmlentities.<locals>.substitute_entity  s    	!++a..C{{1~~$ );;q>>R' 5&s3xx000[[^^z1 5&s3||4445 5
 Xc]] )&r??* ;;==( 	! 	! 	!;;==   	!s$   A"C %3C $C  C C54C5)RE_HTML_ENTITYsub)rK   rq  s     r   decode_htmlentitiesrt    s*    2! ! !, /666r   c              #   .  K   t          |           }	 |r0fdt          j        |t          |                    D             g}n0t	          t          j        |t          |                              g}|d         sdS |                                V  )a  Yield elements from `iterable` in "chunksize"-ed groups.

    The last returned element may be smaller if the length of collection is not divisible by `chunksize`.

    Parameters
    ----------
    iterable : iterable of object
        An iterable.
    chunksize : int
        Split iterable into chunks of this size.
    as_numpy : bool, optional
        Yield chunks as `np.ndarray` instead of lists.

    Yields
    ------
    list OR np.ndarray
        "chunksize"-ed chunks of elements from `iterable`.

    Examples
    --------
    .. sourcecode:: pycon

        >>> print(list(grouper(range(10), 3)))
        [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    Tc                 <    g | ]}t          j        |           S ))dtype)r   array)rD   rt   rw  s     r   rr   z#chunkize_serial.<locals>.<listcomp>  s(    iiiSbhs%888iiir   r   N)r  r  r@  r  r   pop)iterable	chunksizeas_numpyrw  itwrapped_chunks      `  r   chunkize_serialr    s      6 
hB
" 	I jiiiIDTUWY\]fYgYgDhDhiiijMM!)"22s9~~"F"FGGHMQ 	E!!!!!
"r   c                   (     e Zd ZdZ fdZd Z xZS )
InputQueuezPopulate a queue of input chunks from a streamed corpus.

    Useful for reading and chunking corpora in the background, in a separate process,
    so that workers that use the queue are not starved for input chunks.

    c                     t          t          |                                            || _        || _        || _        || _        || _        dS )a  
        Parameters
        ----------
        q : multiprocessing.Queue
            Enqueue chunks into this queue.
        corpus : iterable of iterable of (int, numeric)
            Corpus to read and split into "chunksize"-ed groups
        chunksize : int
            Split `corpus` into chunks of this size.
        as_numpy : bool, optional
            Enqueue chunks as `numpy.ndarray` instead of lists.

        N)superr  r   qmaxsizer   r{  r|  )r#   r  r   r{  r  r|  r   s         r   r   zInputQueue.__init__  sH     	j$((***" r   c                    t          | j                  }	 t          j        || j                  }| j        rd |D             g}nt          |          g}|d         s| j                            d d           d S 	 | j        	                                }n# t          $ r d}Y nw xY wt                              dt          |d                   |           | j                            |                                d           )NTc                 6    g | ]}t          j        |          S r,   )r   asarray)rD   rt   s     r   rr   z"InputQueue.run.<locals>.<listcomp>
  s     !C!C!Cc"*S//!C!C!Cr   r   block?z1prepared another chunk of %i documents (qsize=%s))r  r   r  r@  r{  r|  r   r  putqsizeNotImplementedErrorr    r!   rm   ry  )r#   r}  chunkr~  r  s        r   runzInputQueue.run  s   $+	8$R88E} . "D!CU!C!C!C D!%e # 

4t
,,,&   LLLcR_`aRbNcNcejkkkFJJ}((**$J777'	8s   <B B%$B%)r"   r   r   r   r   r  __classcell__)r   s   @r   r  r    sQ         ! ! ! ! !*8 8 8 8 8 8 8r   r  ntdarwin)rj     c              #      K   |dk    r+t           j        dk    rdnd}t          j        d|z             t	          | ||          D ]}|V  dS )a<  Split `corpus` into fixed-sized chunks, using :func:`~gensim.utils.chunkize_serial`.

        Parameters
        ----------
        corpus : iterable of object
            An iterable.
        chunksize : int
            Split `corpus` into chunks of this size.
        maxsize : int, optional
            Ignored. For interface compatibility only.
        as_numpy : bool, optional
            Yield chunks as `np.ndarray` s instead of lists?

        Yields
        ------
        list OR np.ndarray
            "chunksize"-ed chunks of elements from `corpus`.

        r   r  WindowszOSX with python3.8+z1detected %s; aliasing chunkize to chunkize_serialr|  N)rQ   namewarningswarnr  )r   r{  r  r|  entityr  s         r   chunkizer    st      ( Q; 	X"$'T/LYY7LFMMPVVWWW$VYJJJ 	 	EKKKK	 	r   c              #   R  K   |dk    sJ |dk    r}t          j        |          }t          || |||          }d|_        |                                 	 |                    d          g}|d         dS |                                V  8t          | ||          D ]}|V  dS )a  Split `corpus` into fixed-sized chunks, using :func:`~gensim.utils.chunkize_serial`.

        Parameters
        ----------
        corpus : iterable of object
            An iterable.
        chunksize : int
            Split `corpus` into chunks of this size.
        maxsize : int, optional
            If > 0, prepare chunks in a background process, filling a chunk queue of size at most `maxsize`.
        as_numpy : bool, optional
            Yield chunks as `np.ndarray` instead of lists?

        Yields
        ------
        list OR np.ndarray
            "chunksize"-ed chunks of elements from `corpus`.

        Notes
        -----
        Each chunk is of length `chunksize`, except the last one which may be smaller.
        A once-only input stream (`corpus` from a generator) is ok, chunking is done efficiently via itertools.

        If `maxsize > 0`, don't wait idly in between successive chunk `yields`, but rather keep filling a short queue
        (of size at most `maxsize`) with forthcoming chunks in advance. This is realized by starting a separate process,
        and is meant to reduce I/O delays, which can be significant when `corpus` comes from a slow medium
        like HDD, database or network.

        If `maxsize == 0`, don't fool around with parallelism and simply yield the chunksize
        via :func:`~gensim.utils.chunkize_serial` (no I/O optimizations).

        Yields
        ------
        list of object OR np.ndarray
            Groups based on `iterable`

        r   )r  )r  r|  Tr  Nr  )multiprocessingQueuer  daemonrZ  r  ry  r  )r   r{  r  r|  r  workerr  s          r   r  r  9  s      L 1}Q; 	%g666A69gPXYYYF FMLLNNN"T**+8 Eiikk!!!	" )XNNN   r   c                     t           j                            |           \  } }|                    d          r| |dd         z   |z   dz   } n1|                    d          r| |dd         z   |z   dz   } n| |z   |z   } | S )a2  Append a file extension `ext` to `fname`, while keeping compressed extensions like `.bz2` or
    `.gz` (if any) at the end.

    Parameters
    ----------
    fname : str
        Filename or full path.
    ext : str
        Extension to append before any compression extensions.

    Returns
    -------
    str
        New path to file with `ext` appended.

    Examples
    --------

    .. sourcecode:: pycon

        >>> from gensim.utils import smart_extension
        >>> smart_extension("my_file.pkl.gz", ".vectors")
        'my_file.pkl.vectors.gz'

    r   Nr   )rQ   pathsplitextr   )r   extoexts      r   smart_extensionr  p  s    4 '""5))KE4}}V #SbS	!C'&0	u		 #SbS	!C'%/s"Lr   c                     t          |d          5 }t          j        | ||           ddd           dS # 1 swxY w Y   dS )a$  Pickle object `obj` to file `fname`, using smart_open so that `fname` can be on S3, HDFS, compressed etc.

    Parameters
    ----------
    obj : object
        Any python object.
    fname : str
        Path to pickle file.
    protocol : int, optional
        Pickle protocol number.

    wbr   N)r
   r   r   )r   r   r   fouts       r   r   r     s     
eT		 3dS$22223 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3s   6::c                     t          | d          5 }t          j        |d          cddd           S # 1 swxY w Y   dS )zLoad object from `fname`, using smart_open so that `fname` can be on S3, HDFS, compressed etc.

    Parameters
    ----------
    fname : str
        Path to pickle file.

    Returns
    -------
    object
        Python object loaded from `fname`.

    r/   latin1)r`   N)r
   r   r   )r   r   s     r   r   r     s     
eT		 2a|A1112 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2s   488c                 X    d t          |                                           D             S )a  Reverse a dictionary mapping, i.e. `{1: 2, 3: 4}` -> `{2: 1, 4: 3}`.

    Parameters
    ----------
    d : dict
        Input dictionary.

    Returns
    -------
    dict
        Reversed dictionary mapping.

    Notes
    -----
    When two keys map to the same value, only one of them will be kept in the result (which one is kept is arbitrary).

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.utils import revdict
        >>> d = {1: 2, 3: 4}
        >>> revdict(d)
        {2: 1, 4: 3}

    c                     i | ]\  }}||	S r,   r,   )rD   kvs      r   
<dictcomp>zrevdict.<locals>.<dictcomp>  s    ///VaAq///r   )dictr   )ds    r   revdictr    s&    6 0/tAww}}////r   c                 $    t           t                    r fd}|S t          j                   st          j                   r! dt                    fd            }|S t          t          t                                         )a^  Decorator to mark functions as deprecated.

    Calling a decorated function will result in a warning being emitted, using warnings.warn.
    Adapted from https://stackoverflow.com/a/40301488/8001386.

    Parameters
    ----------
    reason : str
        Reason of deprecation.

    Returns
    -------
    function
        Decorated function

    c                 H     dt                      fd            }|S )Nz'Call to deprecated `{name}` ({reason}).c                      t          j                            j                  t          d            | i |S )N)r  reasonrh   rC   
stacklevelr  r  formatr"   DeprecationWarning)r$   r%   fmtr(   r  s     r   	new_func1z0deprecated.<locals>.decorator.<locals>.new_func1  sN    JJDM&JAA/    
 tT,V,,,r   r   )r(   r  r  r  s   ` @r   	decoratorzdeprecated.<locals>.decorator  sE    ;C4[[- - - - - - [- r   zCall to deprecated `{name}`.c                  ~    t          j                            j                  t          d            | i |S )N)r  rh   r  r  )r$   r%   r  r(   s     r   	new_func2zdeprecated.<locals>.new_func2  sL    M


..+   
 4((((r   )	r   r0   inspectisclass
isfunctionr   r   reprtype)r  r  r  r  r(   s   `  @@r   
deprecatedr    s    " &# ,	 	 	 	 	 		 	  ,G$6v$>$> ,,	t	) 	) 	) 	) 	) 
	)  T&\\**+++r   c               #      K   t          j                    5  t          j        dt                     dV  ddd           dS # 1 swxY w Y   dS )z/Contextmanager for ignoring DeprecationWarning.rs   )rC   N)r  catch_warningsfilterwarningsr  r,   r   r   r   r   	  s       
	 	"	"  3EFFFF                 s    AA
Az!Function will be removed in 4.0.0
   c                 ~    ||          }t          t          |          d           }fd|d|         D             S )am  Debug fnc to help inspect the top `n` most similar documents (according to a similarity index `index`),
    to see if they are actually related to the query.

    Parameters
    ----------
    query : {list of (int, number), numpy.ndarray}
        vector OR BoW (list of tuples)
    texts : str
        object that can return something insightful for each document via `texts[docid]`,
        such as its fulltext or snippet.
    index : any
        A instance from from :mod:`gensim.similarity.docsim`.

    Return
    ------
    list
        a list of 3-tuples (docid, doc's similarity to the query, texts[docid])

    c                     | d          S )Nr  r,   )items    r   r   ztoptexts.<locals>.<lambda>'  s    T!WH r   )keyc                 ,    g | ]\  }}|||         fS r,   r,   )rD   topid	topcosinetextss      r   rr   ztoptexts.<locals>.<listcomp>)  s)    NNN1A	UIuU|,NNNr   N)sorted	enumerate)queryr  rW  rG  simss    `   r   toptextsr    sL    * <D)D//'<'<===DNNNNT"1"XNNNNr   r   c                     t          t          j        dd                    dd         }t          j                            t          j                    | |z             S )zGenerate a random filename in temp.

    Parameters
    ----------
    prefix : str
        Prefix of filename.

    Returns
    -------
    str
        Full path in the in system's temporary folder, ending in a random filename.

    r    rh   N)hexr   randintrQ   r  rJ   tempfile
gettempdir)prefixrandparts     r   	randfnamer  ,  sJ     6>!X..//3H7<<+--v/@AAAr     c                 .   d}t          ||          D ]}|t          |          z   }t                              d||dz
             |5g }|D ].} ||d                   |d<   |d= |                    |           /|}|                     |           |}dS )aO  Memory-friendly upload of documents to a SimServer (or Pyro SimServer proxy).

    Notes
    -----
    Use this function to train or index large collections.abc -- avoid sending the
    entire corpus over the wire as a single Pyro in-memory object. The documents
    will be sent in smaller chunks, of `chunksize` documents each.

    r   zuploading documents %i-%ir  NrK   ru   )grouperrm   r    r   r   buffer)	serverdocsr{  
preprocessrZ  r  rb  pchunkrt   s	            r   upload_chunkedr  >  s     Ey))  c%jj /a@@@ 	F # # *
3v; 7 7HKc""""Ee r   Tc                     ddl }	 |                    | |||          S # |j        j        $ r t	          d          w xY w)a  Get a Pyro4 name server proxy.

    Parameters
    ----------
    host : str, optional
        Name server hostname.
    port : int, optional
        Name server port.
    broadcast : bool, optional
        Use broadcast mechanism? (i.e. reach out to all Pyro nodes in the network)
    hmac_key : str, optional
        Private key.

    Raises
    ------
    RuntimeError
        When Pyro name server is not found.

    Returns
    -------
    :class:`Pyro4.core.Proxy`
        Proxy from Pyro4.

    r   NzPyro name server not found)Pyro4r&  rZ   NamingErrorRuntimeError)r0  r1  	broadcasthmac_keyr  s        r   getNSr  X  sY    2 LLL9~~dD)X>>><# 9 9 978889s    =c                 "   |i }|r0| dt          t          j        dd                    dd         z   z  } ddl}t	          di |5 }|                    |pt                      |pd          5 }|                    ||           }	|                    |            |                    | |	           t          
                    d| |	           |                                 ddd           n# 1 swxY w Y   ddd           dS # 1 swxY w Y   dS )zRegister an object with the Pyro name server.

    Start the name server if not running yet and block until the daemon is terminated.
    The object is registered under `name`, or `name`+ some random suffix if `random_suffix` is set.

    Nr   r   r  rh   z(%s registered with nameserver (URI '%s')r,   )r  r   r  r  r  Daemonr:  registerremover    r   requestLoop)
r  r   random_suffixipr1  ns_confr  r8  r  uris
             r   pyro_daemonr  x  s      ;c&.H5566qrr:::LLL					 !R\\"+	TYQ77 	!6//#t,,CIIdOOOKKc"""KKBD#NNN   	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !s7   'D.A2C, D,C0	0D3C0	4DDD      ?      ?c                     t           j                            | f          fdt          |           D             S )a  Create a random gensim BoW vector, with the feature counts following the Poisson distribution.

    Parameters
    ----------
    dim : int, optional
        Dimension of vector.
    prob_nnz : float, optional
        Probability of each coordinate will be nonzero, will be drawn from the Poisson distribution.
    lam : float, optional
        Lambda parameter for the Poisson distribution.

    Returns
    -------
    list of (int, float)
        Vector in BoW format.

    )r   c                     g | ]@}|         k     |t          t          j                                       dz             fAS ))lamr   )r   r   r   poisson)rD   r  r  r   prob_nnzs     r   rr   z!mock_data_row.<locals>.<listcomp>  sP    bbbQPSTUPVYaPabQbi''C'0036778bbbr   )r   r   uniformr
  )dimr  r  r   s    ``@r   mock_data_rowr    sG    $ )

#

(
(Cbbbbbb%**bbbbr   c                 B    fdt          |           D             S )a}  Create a random Gensim-style corpus (BoW), using :func:`~gensim.utils.mock_data_row`.

    Parameters
    ----------
    n_items : int
        Size of corpus
    dim : int
        Dimension of vector, used for :func:`~gensim.utils.mock_data_row`.
    prob_nnz : float, optional
        Probability of each coordinate will be nonzero, will be drawn from Poisson distribution,
        used for :func:`~gensim.utils.mock_data_row`.
    lam : float, optional
        Parameter for Poisson distribution, used for :func:`~gensim.utils.mock_data_row`.

    Returns
    -------
    list of list of (int, float)
        Gensim-style corpus.

    c                 4    g | ]}t                     S ))r  r  r  )r  )rD   rl   r  r  r  s     r   rr   zmock_data.<locals>.<listcomp>  s(    WWW1McH#>>>WWWr   )r
  )n_itemsr  r  r  s    ```r   	mock_datar    s.    * XWWWWWgWWWWr   c           	         d}t          |           }t          |           D ](}t          || |         ||          s|| |         z  }| |= )t                              d|t          |           z
  ||t          |                      |S )a  Remove all entries from the `vocab` dictionary with count smaller than `min_reduce`.

    Modifies `vocab` in place, returns the sum of all counts that were pruned.

    Parameters
    ----------
    vocab : dict
        Input dictionary.
    min_reduce : int
        Frequency threshold for tokens in `vocab`.
    trim_rule : function, optional
        Function for trimming entities from vocab, default behaviour is `vocab[w] <= min_reduce`.

    Returns
    -------
    result : int
        Sum of all counts that were pruned.

    r   z:pruned out %i tokens with count <=%i (before %i, after %i))rm   r   keep_vocab_itemr    r   )vocab
min_reduce	trim_ruler'   old_lenws         r   prune_vocabr    s    ( F%jjG%[[  q%(J	BB 	eAhFa
KKD#e**j'3u::   Mr   c                     |t          |           k    rdS t          j        ||                                           d         }t	          | ||           dS )a  Retain `topk` most frequent words in `vocab`.
    If there are more words with the same frequency as `topk`-th one, they will be kept.
    Modifies `vocab` in place, returns nothing.

    Parameters
    ----------
    vocab : dict
        Input dictionary.
    topk : int
        Number of words with highest frequencies to keep.
    trim_rule : function, optional
        Function for trimming entities from vocab, default behaviour is `vocab[w] <= min_count`.

    Nr   )r  )rm   heapqnlargestvaluesr  )r  topkr  	min_counts       r   trim_vocab_by_freqr    sU     s5zz tU\\^^44R8IyI666666r   c                 n    |                                 D ]\  }}|| v r| |xx         |z  cc<   || |<    | S )a]  Merge `dict1` of (word, freq1) and `dict2` of (word, freq2) into `dict1` of (word, freq1+freq2).
    Parameters
    ----------
    dict1 : dict of (str, int)
        First dictionary.
    dict2 : dict of (str, int)
        Second dictionary.
    Returns
    -------
    result : dict
        Merged dictionary with sum of frequencies as values.
    )r   )dict1dict2wordfreqs       r   merge_countsr!    sS     kkmm  
d5= 	$KKK4KKKKE$KKLr   c                 N    	 |                                  S # t          $ r Y dS w xY w)zGet the (approximate) queue size where available.

    Parameters
    ----------
    queue : :class:`queue.Queue`
        Input queue.

    Returns
    -------
    int
        Queue size, -1 if `qsize` method isn't implemented (OS X).

    r   )r  r  )queues    r   r  r  
  s7    {{}}   rrs    
$$r  c                 h    ||k    }||S  || ||          }|t           k    rdS |t          k    rdS |S )a6  Should we keep `word` in the vocab or remove it?

    Parameters
    ----------
    word : str
        Input word.
    count : int
        Number of times that word appeared in a corpus.
    min_count : int
        Discard words with frequency smaller than this.
    trim_rule : function, optional
        Custom function to decide whether to keep or discard this word.
        If a custom `trim_rule` is not specified, the default behaviour is simply `count >= min_count`.

    Returns
    -------
    bool
        True if `word` should stay, False otherwise.

    NTF)	RULE_KEEPRULE_DISCARD)r  countr  r  default_resrule_ress         r   r  r  $  sY    * 9$K 	9T5)44y  	4% 	5r   c                    	 t                               d||           t          j        |d| i|}|                                \  }}|                                }|r=|                    d          }||d         }t          j        ||          }||_        ||S # t          $ r |
                                  w xY w)a  Run OS command with the given arguments and return its output as a byte string.

    Backported from Python 2.7 with a few minor modifications. Used in word2vec/glove2word2vec tests.
    Behaves very similar to https://docs.python.org/2/library/subprocess.html#subprocess.check_output.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.utils import check_output
        >>> check_output(args=['echo', '1'])
        '1\n'

    Raises
    ------
    KeyboardInterrupt
        If Ctrl+C pressed.

    zCOMMAND: %s %sstdoutr$   Nr   )r    r!   
subprocessPopencommunicatepollr  CalledProcessErroroutputKeyboardInterrupt	terminate)	r+  	popenargsr%   processr1  
unused_errretcodecmderrors	            r   check_outputr:  G  s    (%y&999"9G&GGG$0022
,,.. 	**V$$C #l1'3??E!ELK   s   BB  B=c                      |r=t          j        t                     t          t	                     |                    n&t          j                                         |          } fd|D             S )a  Selected `n` (possibly random) items from the dictionary `d`.

    Parameters
    ----------
    d : dict
        Input dictionary.
    n : int, optional
        Number of items to select.
    use_random : bool, optional
        Select items randomly (without replacement), instead of by the natural dict iteration order?

    Returns
    -------
    list of (object, object)
        Selected items from dictionary, as a list.

    c                 $    g | ]}||         fS r,   r,   )rD   r  r  s     r   rr   zsample_dict.<locals>.<listcomp>  s!    333cS!C&M333r   )r   sampler   rO  rm   r  r@  r  )r  rG  
use_randomselected_keyss   `   r   sample_dictr@  m  sj    $ ?IkFM$q''3s1vvq>>:::iN^_`_e_e_g_gijNkNkM3333]3333r   c                 R   t          j        |           } || j        d         k    rt          j        | g          S || j        d         k    rt          j        d          S | j        d         }t           j        j                            | | j        d         |z
  dz   |f||f          S )a  Produce a numpy.ndarray of windows, as from a sliding window.

    Parameters
    ----------
    ndarray : numpy.ndarray
        Input array
    window_size : int
        Sliding window size.

    Returns
    -------
    numpy.ndarray
        Subsequences produced by sliding a window of the given size over the `ndarray`.
        Since this uses striding, the individual arrays are views rather than copies of `ndarray`.
        Changes to one view modifies the others and the original.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.utils import strided_windows
        >>> strided_windows(np.arange(5), 2)
        array([[0, 1],
               [1, 2],
               [2, 3],
               [3, 4]])
        >>> strided_windows(np.arange(10), 5)
        array([[0, 1, 2, 3, 4],
               [1, 2, 3, 4, 5],
               [2, 3, 4, 5, 6],
               [3, 4, 5, 6, 7],
               [4, 5, 6, 7, 8],
               [5, 6, 7, 8, 9]])

    r   )r   r   r  )shapestrides)	r   r  rB  rx  r   rC  libstride_tricks
as_strided)r   window_sizestrides      r   strided_windowsrI    s    H j!!GgmA&& "x	"""	w}Q'	' "z&!!!_QF6**a(;6:KH  + " " "r   c              #   x   K   t          |           D ]'\  }}t          ||||          D ]}|r||fV  |V  (dS )ak  Produce a generator over the given texts using a sliding window of `window_size`.

    The windows produced are views of some subsequence of a text.
    To use deep copies instead, pass `copy=True`.

    Parameters
    ----------
    texts : list of str
        List of string sentences.
    window_size : int
        Size of sliding window.
    copy : bool, optional
        Produce deep copies.
    ignore_below_size : bool, optional
        Ignore documents that are not at least `window_size` in length?
    include_doc_num : bool, optional
        Yield the text position with `texts` along with each window?

    N)r  _iter_windows)r  rG  copyignore_below_sizeinclude_doc_numdoc_numr   windows           r   iter_windowsrQ    sx      ( 'u--  #Hk4ARSS 	 	F '''''		 r   c              #      K   t          | |          }|j        d         dk    r |s|r|                                 n| V  d S d S |D ]}|r|                                n|V  d S r  )rI  rB  rL  )r   rG  rL  rM  doc_windows
doc_windows         r   rK  rK    s      !(K88Kq  <  	8%)7(--///x77777	8 	8 & 	< 	<J'+;*//###;;;;	< 	<r   c                 :    t          t          |                     S )a  Recursively flatten a nested sequence of elements.

    Parameters
    ----------
    nested_list : iterable
        Possibly nested sequence of elements to flatten.

    Returns
    -------
    list
        Flattened version of `nested_list` where any elements that are an iterable (`collections.abc.Iterable`)
        have been unpacked into the top-level list, in a recursive fashion.

    )r   lazy_flatten)nested_lists    r   flattenrX    s     [))***r   c              #      K   | D ]Q}t          |t          j        j                  r,t          |t                    st          |          D ]}|V  M|V  RdS )zLazy version of :func:`~gensim.utils.flatten`.

    Parameters
    ----------
    nested_list : list
        Possibly nested list.

    Yields
    ------
    object
        Element of list

    N)r   collectionsabcIterabler0   rX  )rW  elrs  s      r   rV  rV    s{         b+/233 	Jr3<O<O 	r{{  				 HHHH r   c                     t          |dd          5 }| D ]<}t          d                    |          dz             }|                    |           =	 ddd           dS # 1 swxY w Y   dS )zSave the corpus in LineSentence format, i.e. each sentence on a separate line,
    tokens are separated by space.

    Parameters
    ----------
    corpus : iterable of iterables of strings

    r  r=   )moder`    r(  N)r
   r{   rJ   write)r   r   r  sentencelines        r   save_as_line_sentencerd    s     
hTF	3	3	3 t 	 	Hsxx11D899DJJt	                 s   A A!!A%(A%c                     | dk    rt          d          | dS | dk     r't          t          j                    dz   | z   d          } | S )a0  Determines the number of jobs can run in parallel.

    Just like in sklearn, passing n_jobs=-1 means using all available
    CPU cores.

    Parameters
    ----------
    n_jobs : int
        Number of workers requested by caller.

    Returns
    -------
    int
        Number of effective jobs.

    r   z&n_jobs == 0 in Parallel has no meaningNr  )r   r   r  	cpu_count)n_jobss    r   effective_n_jobsrh    sb    " { BABBB	 Bq	! B_.0014v=qAAMr   c                    t           j                            |           r| j        d         dk    S t	          | t
          j                  rdS 	 t          t          |                     }dS # t          $ r Y dS t          $ r Y dS w xY w)z:Is the corpus (an iterable or a scipy.sparse array) empty?r  r   FT)r   r   issparserB  r   typesGeneratorTyper  r  StopIterationr5   )r   	first_docs     r   is_emptyro  *  s    |V$$ $|A!##&%-.. uf&&	u   tt   uus   A, ,
B9	BB)FFr=   rW   FF)Frh   ri   )rW   r=   )r=   rW   )r   F)r  )r   )r  N)NNTN)FNNN)r  r  r   )r  r  r  r   r   )r  T)FTF)FT)yr   
__future__r   
contextlibr   collections.abcrZ  r   r  r   html.entitiesr   rn  r   r   rerB   rQ   r   r  r  	functoolsr   r  rO   r7   r,  r  r  rL  r   r	   r   rk  numpyr   scipy.sparser   
smart_openr
   r   r   r   	getLoggerr"   r    r   compileUNICODErd   rr  r  	NO_CYTHONdefault_rngdefault_prngr   r-   r3   r;   rM   rV   rb   r]   rv   ry   to_utf8r{   r[   r~   r   r   r   r   r  r$  r:  r<  rE  rJ  rR  rg  rt  float32r  r  Processr  r  version_infor  r  r   r  r  r   r  r  r  r  r  r  r  r  r  r!  r  RULE_DEFAULTr&  r%  r  PIPEr:  r@  rI  rQ  rK  rX  rV  rd  rh  ro  r,   r   r   <module>r     su   ) ( % % % % % % % % % % % %        0 0 0 0 0 0     				     				                  



                                   0 0 0 0 0 0		8	$	$ .
;;5rzBBL; 	 S y$$&&Z Z ZB  .  . + + +:0 0 0<  .(! (! (! (!V  $   := = = =2 . . . .. 
L L L&yn yn yn yn yn yn yn ynx  "  2@ @ @ @ @ @ @ @F  43 3 3l! ! !HI I I I I8 I I ID       D4 4 4 4 4H 4 4 48* * * * *8 * * *Z* * *0/7 /7 /7d 38rz &" &" &" &"R 18 18 18 18 18( 18 18 18p 7d? Os|x/ OC4D4N O    44 4 4 4n" " "J !0 3 3 3 3"2 2 2$0 0 0<0, 0, 0,f    /00O O O 10O4B B B B$ /00   1029 9 9 9@! ! ! !.c c c c,X X X X0   B7 7 7 7,  ,  * 	       F # # # # #L4 4 4 4,-" -" -"`   8< < < <+ + +$  ,    4    r   