
    c1U                        d Z ddlZddlZddlZddlZddlZddlZdZdZ	 ej
        e          Z ej        d          Z ej        d          Zg dZg dZ ej        d          Zed	k    r ej        ej                  Zned
k    r ej        ej                  ZndZd Z e e e                                Z ej        de          Z	 d Zd dZd!dZ d"dZ!d Z"d#dZ#d Z$d Z%d Z&d Z'd Z(d Z)d Z*d Z+d Z,d Z-dS )$u  Load models from the native binary format released by Facebook.

The main entry point is the :func:`~gensim.models._fasttext_bin.load` function.
It returns a :class:`~gensim.models._fasttext_bin.Model` namedtuple containing everything loaded from the binary.

Examples
--------

Load a model from a binary file:

.. sourcecode:: pycon

    >>> from gensim.test.utils import datapath
    >>> from gensim.models.fasttext_bin import load
    >>> with open(datapath('crime-and-punishment.bin'), 'rb') as fin:
    ...     model = load(fin)
    >>> model.nwords
    291
    >>> model.vectors_ngrams.shape
    (391, 5)
    >>> sorted(model.raw_vocab, key=lambda w: len(w), reverse=True)[:5]
    ['останавливаться', 'изворачиваться,', 'раздражительном', 'exceptionally', 'проскользнуть']

See Also
--------

`FB Implementation <https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc>`_.

    N       iO/))dimi)wsr   epochr   	min_countr   negr   word_ngramsr   lossr   modelr   bucketr   minnr   maxnr   lr_update_rater   td)r   r
   r   r   r   r   r   r   r   r   r   z@f      c               #      K   t           t          z   D ]\  } }|                     d          s| V  dV  dV  dV  dV  dV  dV  d S )N_	raw_vocab
vocab_sizenwordsvectors_ngramshidden_outputntokens)_OLD_HEADER_FORMAT_NEW_HEADER_FORMAT
startswith)namer"   s     ;lib/python3.11/site-packages/gensim/models/_fasttext_bin.py_yield_field_namesr.   k   s      %(::  as## 	JJJ


NNN


OOOOO    Modelc                 z    t          j        |          }t          j        ||                     |                    S )N)structcalcsizeunpackread)finfmt	num_bytess      r-   _struct_unpackr9      s/    $$I=chhy11222r/   utf-8c                 d   t          | d          \  }}}|dk    rt          d          t                              d|| j                   t          | d          d         }|rt          | d          \  }t          j                    }t          |          D ]}	t          j	                    }
| 
                    d          }|t          k    r5|
                    |           | 
                    d          }|t          k    5|
                                }
	 |
                    |          }nC# t          $ r6 |
                    |d          }t                              d	|
|           Y nw xY wt          | d
          \  }}|||<   |r"t          |          D ]}t          | d           ||||fS )a`  Load a vocabulary from a FB binary.

    Before the vocab is ready for use, call the prepare_vocab function and pass
    in the relevant parameters from the model.

    Parameters
    ----------
    fin : file
        An open file pointer to the binary.
    new_format: boolean
        True if the binary is of the newer format.
    encoding : str
        The encoding to use when decoding binary data into words.

    Returns
    -------
    tuple
        The loaded vocabulary.  Keys are words, values are counts.
        The vocabulary size.
        The number of words.
        The number of tokens.
    z@3ir   z,Supervised fastText models are not supportedz+loading %s words for fastText model from %sz@q   backslashreplace)errorszQfailed to decode invalid unicode bytes %r; replacing invalid characters, using %rz@qb@2i)r9   NotImplementedErrorloggerinfor,   collectionsOrderedDictrangeioBytesIOr5   _END_OF_WORD_MARKERwritegetvaluedecodeUnicodeDecodeErrorerror)r6   
new_formatencodingr$   r%   nlabelsr(   pruneidx_sizer#   r   
word_bytes	char_bytewordcountr"   js                   r-   _load_vocabrW      s   . #1e"<"<J { R!"PQQQ
KK=z38TTTS$''*G 3'T22'))I:    Z\\
HHQKK	.. 	$Y'''I .. 	$  ((**
	$$X..DD! 	 	 	$$X6H$IIDLLcD    	 "#u--q	$ '}%% 	' 	'A3&&&&j&'11s   D,,=E,+E,Tc                    t           t          dt          z            |rt          | d           t          | d          \  }}||z  }t	          | t
          j                  r1t                              d           t          | t           |          }nt          j        | t           |          }|j        |fk    sJ d|d|j                    |                    ||f          }|S )aA  Load a matrix from fastText native format.

    Interprets the matrix dimensions and type from the file stream.

    Parameters
    ----------
    fin : file
        A file handle opened for reading.
    new_format : bool, optional
        True if the quant_input variable precedes
        the matrix declaration.  Should be True for newer versions of fastText.

    Returns
    -------
    :class:`numpy.array`
        The vectors as an array.
        Each vector will be a row in the array.
        The number of columns of the array will correspond to the vector size.

    Nzbad _FLOAT_SIZE: %r@?@2qzLoading model from a compressed .gz file.  This can be slow. This is a work-around for a bug in NumPy: https://github.com/numpy/numpy/issues/13470. Consider decompressing your model file for a faster load. z
expected (z	,),  got )_FLOAT_DTYPE
ValueError_FLOAT_SIZEr9   
isinstancegzipGzipFilerA   warning	_fromfilenpfromfileshapereshape)r6   rN   num_vectorsr   rU   matrixs         r-   _load_matrixri      s    *  >.<=== "sD!!!%c511K#E #t}%% 7I	
 	
 	

 3e44S,66<E8#VVV%%%%VVVV^^[#.//FMr/       .Ac              #      K   ||k    r't          | d|z            }|D ]}|V  ||z  }||k    't          | d|z            }|D ]}|V  dS )zRead `count` floats from `fin`.

    Batches up read calls to avoid I/O overhead.  Keeps no more than batch_size
    floats in memory at once.

    Yields floats.

    z@%dfNr9   )r6   rU   
batch_sizebatchfs        r-   _batched_generatorrp   !  s       *
 sFZ$788 	 	AGGGG	 *
  3//E   r/   c                 J    t          j        t          | |          |          S )z#Reimplementation of numpy.fromfile.)dtype)rc   fromiterrp   )r6   rr   rU   s      r-   rb   rb   5  s"    ;)#u55UCCCCr/   c                 l    t           t                    rt           d           t           d          \  }}|t          k    }|rt
          nt          } fd|D             }|s|                    ||           t           ||          \  }}	}
}|                    ||	|
|           t           |          }|sd}n3t           |          } 
                                d	k    s
J d
            |                    ||           d |                                D             }t          di |S )a  Load a model from a binary stream.

    Parameters
    ----------
    fin : file
        The readable binary stream.
    encoding : str, optional
        The encoding to use for decoding text
    full_model : boolean, optional
        If False, skips loading the hidden output matrix.  This saves a fair bit
        of CPU time and RAM, but prevents training continuation.

    Returns
    -------
    :class:`~gensim.models._fasttext_bin.Model`
        The loaded model.

    rbr?   c                 D    i | ]\  }}|t          |          d          S )r   rl   ).0r,   r7   r6   s      r-   
<dictcomp>zload.<locals>.<dictcomp>T  s.    NNN;D#T>#s++A.NNNr/   )r   r   )rO   )r#   r$   r%   r(   )rN   Nr/   zexpected to reach EOF)r&   r'   c                 ,    i | ]\  }}|t           v ||S  )_FIELD_NAMES)rw   kvs      r-   rx   zload.<locals>.<dictcomp>e  s(    AAAdaqL/@AQAAAr/   rz   )r^   stropenr9   _FASTTEXT_FILEFORMAT_MAGICr*   r)   updaterW   ri   r5   itemsr0   )r6   rO   
full_modelmagicversionrN   header_specr   r#   r$   r%   r(   r&   r'   s   `             r-   loadr   :  sY   & #s 3oo#C//NE744J(2J$$8JKNNNN+NNNE ,7+++-8jS[-\-\-\*Iz67	LL9FT[L\\\!#*===N :$SZ@@@xxzzS 99"9999	LLmLLLLAAekkmmAAAE>>5>>r/   c                     | j         | j        | j        }}}d                    d |||         D                       }||fS )a|  Replace byte sequences that failed to decode with character escapes.

    Does the same thing as errors="backslashreplace" from Python 3.  Python 2
    lacks this functionality out of the box, so we need to backport it.

    Parameters
    ----------
    ex: UnicodeDecodeError
        contains arguments of the string and start/end indexes of the bad portion.

    Returns
    -------
    text: unicode
        The Unicode string corresponding to the decoding of the bad section.
    end: int
        The index from which to continue decoding.

    Note
    ----
    Works on Py2 only.  Py3 already has backslashreplace built-in.

     c              3   Z   K   | ]&}d                      t          |                    V  'dS )z\x{:02x}N)formatord)rw   cs     r-   	<genexpr>z-_backslashreplace_backport.<locals>.<genexpr>  s6      HH1K&&s1vv..HHHHHHr/   )objectstartendjoin)exbstrr   r   texts        r-   _backslashreplace_backportr   i  sI    6 y"(BF%D88HHU3YHHHHHD9r/   c                     |                      t                                                     |                      t                                                     dS )a|  
    Write signature of the file in Facebook's native fastText `.bin` format
    to the binary output stream `fout`. Signature includes magic bytes and version.

    Name mimics original C++ implementation, see
    [FastText::signModel](https://github.com/facebookresearch/fastText/blob/master/src/fasttext.cc)

    Parameters
    ----------
    fout: writeable binary stream
    N)rI   r   tobytes_FASTTEXT_VERSION)fouts    r-   _sign_modelr     sH     	JJ)1133444JJ ((**+++++r/   c                     |dk    r&t          j        |                                           S |dk    r&t          j        |                                           S t	          d|z            )a  
    Auxiliary function that converts `field_value` to bytes based on request `field_type`,
    for saving to the binary file.

    Parameters
    ----------
    field_value: numerical
        contains arguments of the string and start/end indexes of the bad portion.

    field_type: str
        currently supported `field_types` are `i` for 32-bit integer and `d` for 64-bit float
    r   r   z6Currently conversion to "%s" type is not implemmented.)rc   int32r   float64r@   )field_value
field_types     r-   _conv_field_to_bytesr     sm     S i%%--//0	s	 i
;''//112!"Z]g"ghhhr/   c                    |dk    r| j         j        S |dk    r| j        S |dk    r| j        S |dk    r6| j        dk    rdS | j        dk    rdS | j        dk    r| j        dk    rdS dS dS |dk    r| j         j        S |d	k    r| j         j        S |d
k    r| j        S |dk    r| j	        dk    rdndS |dk    r| j        S |dk    r| j
        S |dk    rdS |dk    r| j        S |dk    rdS d|z   dz   }t          |          )z
    Extract `field` from `model`.

    Parameters
    ----------
    model: gensim.models.fasttext.FastText
        model from which `field` is extracted
    field: str
        requested field name, fields are listed in the `_NEW_HEADER_FORMAT` list
    r   r   r	   r   r<   r      r   r   r   r   r   r   r   r   r   d   zExtraction of header field "z/" from Gensim FastText object not implemmented.N)wvr   vector_sizeepochshsnegativemax_nmin_nr   sgsamplewindowr@   )r   fieldmsgs      r-   _get_field_from_modelr     s     ,'x	% *'  	'	 ('|	& &' 8q= 	1X] 	1X] 	u~2 	1	 	 	 		& 'x~	& 'x~	+	 '	'	 '
 HM(qqq(	% '~	# '|	-	 
'q	$ '|	"	" 's,u47hh!#&&&r/   c                     t           D ]E\  }}||v r	||         }nt          ||          }|                     t          ||                     FdS )a  
    Saves header with `model` parameters to the binary stream `fout` containing a model in the Facebook's
    native fastText `.bin` format.

    Name mimics original C++ implementation, see
    [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc)

    Parameters
    ----------
    fout: writeable binary stream
        stream to which model is saved
    model: gensim.models.fasttext.FastText
        saved model
    fb_fasttext_parameters: dictionary
        dictionary contain parameters containing `lr_update_rate`, `word_ngrams`
        unused by gensim implementation, so they have to be provided externally
    N)r*   r   rI   r   )r   r   fb_fasttext_parametersr   r   r   s         r-   
_args_saver     so    $ 0 B Bz** 	>07KK/u==K

'Z@@AAAAB Br/   c                    |                      t          j        t          |j                                                                       |                      t          j        t          |j                                                                       |                      t          j        d                                                     |                      t          j        |j                                                             |                      t          j        d                     |j        j        D ]}|j        	                    |d          }|                      |
                    |                     |                      t                     |                      t          j        |                                                     |                      t                     dS )aI  
    Saves the dictionary from `model` to the to the binary stream `fout` containing a model in the Facebook's
    native fastText `.bin` format.

    Name mimics the original C++ implementation
    [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)

    Parameters
    ----------
    fout: writeable binary stream
        stream to which the dictionary from the model is saved
    model: gensim.models.fasttext.FastText
        the model that contains the dictionary to save
    encoding: str
        string encoding used in the output
    r   rU   N)rI   rc   r   lenr   r   int64corpus_total_wordsindex_to_keyget_vecattrencoderH   _DICT_WORD_ENTRY_TYPE_MARKER)r   r   rO   rT   
word_counts        r-   
_dict_saver     sw   0 	JJrxEH&&..00111JJrxEH&&..00111 	JJrx{{""$$%%%JJrx01199;;<<< 	JJrx||% 1 1X))$88


4;;x(()))

&'''

28J''//11222

/00001 1r/   c                    |j         j        j        \  }}|j         j        j        \  }}||k    sJ |t	          |j                   k    sJ ||j         j        k    sJ |                     t          j        d||z   |                     |                     |j         j        	                                           |                     |j         j        	                                           dS )a  
    Saves word and ngram vectors from `model` to the binary stream `fout` containing a model in
    the Facebook's native fastText `.bin` format.

    Corresponding C++ fastText code:
    [DenseMatrix::save](https://github.com/facebookresearch/fastText/blob/master/src/densematrix.cc)

    Parameters
    ----------
    fout: writeable binary stream
        stream to which the vectors are saved
    model: gensim.models.fasttext.FastText
        the model that contains the vectors to save
    rZ   N)
r   vectors_vocabre   r&   r   r   rI   r2   packr   )r   r   vocab_n	vocab_dimngrams_n
ngrams_dims         r-   _input_saver   0  s     /5GY 828Hj
""""c%(mm####ux&&&&JJv{5'H"4i@@AAAJJux%--//000JJux&..0011111r/   c                     |j         r|j        }|j        r|j        }|j        \  }}|                     t          j        d||                     |                     |                                           dS )a  
    Saves output layer of `model` to the binary stream `fout` containing a model in
    the Facebook's native fastText `.bin` format.

    Corresponding C++ fastText code:
    [DenseMatrix::save](https://github.com/facebookresearch/fastText/blob/master/src/densematrix.cc)

    Parameters
    ----------
    fout: writeable binary stream
        the model that contains the output layer to save
    model: gensim.models.fasttext.FastText
        saved model
    rZ   N)	r   syn1r   syn1negre   rI   r2   r   r   )r   r   r'   hidden_n
hidden_dims        r-   _output_saver   K  sw     x #
~ &(.HjJJv{5(J77888JJ}$$&&'''''r/   c                 H   t          |           t          || |           t          || |           |                    t	          j        dd                     t          ||            |                    t	          j        dd                     t          ||            dS )aY  
    Saves word embeddings to binary stream `fout` using the Facebook's native fasttext `.bin` format.

    Parameters
    ----------
    fout: file name or writeable binary stream
        stream to which the word embeddings are saved
    model: gensim.models.fasttext.FastText
        the model that contains the word embeddings to save
    fb_fasttext_parameters: dictionary
        dictionary contain parameters containing `lr_update_rate`, `word_ngrams`
        unused by gensim implementation, so they have to be provided externally
    encoding: str
        encoding used in the output file
    rY   FN)r   r   r   rI   r2   r   r   r   )r   r   r   rO   s       r-   _save_to_streamr   d  s    " tU2333tUH%%%JJv{4''((( eJJv{4''((( ur/   c                     t          |t                    r=t          |d          5 }t          | |||           ddd           dS # 1 swxY w Y   dS t          | |||           dS )a  
    Saves word embeddings to the Facebook's native fasttext `.bin` format.

    Parameters
    ----------
    fout: file name or writeable binary stream
        stream to which model is saved
    model: gensim.models.fasttext.FastText
        saved model
    fb_fasttext_parameters: dictionary
        dictionary contain parameters containing `lr_update_rate`, `word_ngrams`
        unused by gensim implementation, so they have to be provided externally
    encoding: str
        encoding used in the output file

    Notes
    -----
    Unfortunately, there is no documentation of the Facebook's native fasttext `.bin` format

    This is just reimplementation of
    [FastText::saveModel](https://github.com/facebookresearch/fastText/blob/master/src/fasttext.cc)

    Based on v0.9.1, more precisely commit da2745fcccb848c7a225a7d558218ee4c64d5333

    Code follows the original C++ code naming.
    wbN)r^   r~   r   r   )r   r   r   rO   fout_streams        r-   saver     s    8 $ G$ 	RE;0FQQQ	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	R 	t%;XFFFFFs   AA
A
)r:   )T)rj   )r:   T).__doc__rC   r_   rF   loggingr2   numpyrc   rH   r   	getLogger__name__rA   r   r   r   r*   r)   r3   r]   rr   float32r[   r   r.   sortedsetr{   
namedtupler0   r9   rW   ri   rp   rb   r   r   r   r   r   r   r   r   r   r   r   rz   r/   r-   <module>r      s   <      				         '  
	8	$	$
 BHRLL %RXi00         fod##! 28BJ''LLA 28BJ''LLL	 	 	 vcc,,..//00w55*Z3 3 3
<2 <2 <2 <2~4 4 4 4n   (D D D
, , , ,^  @, , , i i i*7' 7' 7'tB B B4)1 )1 )1\2 2 26( ( (2  < G  G  G  G  Gr/   