
    c:                         d Z ddlZddlmZmZ  ej        e          Z G d dej                  Z	 G d de	          Z
 G d d	ej                  Z G d
 dej                  ZdS )ao  Basic interfaces used across the whole Gensim package.

These interfaces are used for building corpora, model transformation and similarity queries.

The interfaces are realized as abstract base classes. This means some functionality is already
provided in the interface itself, and subclasses should inherit from these interfaces
and implement the missing methods.

    N)utilsmatutilsc                   F     e Zd ZdZd Z fdZd Zedd            Z xZ	S )		CorpusABCa3  Interface for corpus classes from :mod:`gensim.corpora`.

    Corpus is simply an iterable object, where each iteration step yields one document:

    .. sourcecode:: pycon

        >>> from gensim.corpora import MmCorpus  # inherits from the CorpusABC class
        >>> from gensim.test.utils import datapath
        >>>
        >>> corpus = MmCorpus(datapath("testcorpus.mm"))
        >>> for doc in corpus:
        ...     pass  # do something with the doc...

    A document represented in the bag-of-word (BoW) format, i.e. list of (attr_id, attr_value),
    like ``[(1, 0.2), (4, 0.6), ...]``.

    .. sourcecode:: pycon

        >>> from gensim.corpora import MmCorpus  # inherits from the CorpusABC class
        >>> from gensim.test.utils import datapath
        >>>
        >>> corpus = MmCorpus(datapath("testcorpus.mm"))
        >>> doc = next(iter(corpus))
        >>> print(doc)
        [(0, 1.0), (1, 1.0), (2, 1.0)]

    Remember that the save/load methods only pickle the corpus object, not
    the (streamed) corpus data itself!
    To save the corpus data, please use this pattern :

    .. sourcecode:: pycon

        >>> from gensim.corpora import MmCorpus  # MmCorpus inherits from CorpusABC
        >>> from gensim.test.utils import datapath, get_tmpfile
        >>>
        >>> corpus = MmCorpus(datapath("testcorpus.mm"))
        >>> tmp_path = get_tmpfile("temp_corpus.mm")
        >>>
        >>> MmCorpus.serialize(tmp_path, corpus)  # serialize corpus to disk in the MmCorpus format
        >>> loaded_corpus = MmCorpus(tmp_path)  # load corpus through constructor
        >>> for (doc_1, doc_2) in zip(corpus, loaded_corpus):
        ...     assert doc_1 == doc_2  # no change between the original and loaded corpus


    See Also
    --------
    :mod:`gensim.corpora`
        Corpora in different formats.

    c                      t          d          )zIterate all over corpus.&cannot instantiate abstract base classNotImplementedErrorselfs    1lib/python3.11/site-packages/gensim/interfaces.py__iter__zCorpusABC.__iter__L   s    !"JKKK    c                 z    ddl }|                    d            t          t          |           j        |i | dS )ag  Saves the in-memory state of the corpus (pickles the object).

        Warnings
        --------
        This saves only the "internal state" of the corpus object, not the corpus data!

        To save the corpus data, use the `serialize` method of your desired output format
        instead, e.g. :meth:`gensim.corpora.mmcorpus.MmCorpus.serialize`.

        r   Nzcorpus.save() stores only the (tiny) iteration object in memory; to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus))warningswarnsuperr   save)r   argskwargsr   	__class__s       r   r   zCorpusABC.saveP   sT     	Z	
 	
 	
 	$i#T4V44444r   c                      t          d          )z:Get the corpus size = the total number of documents in it.z2must override __len__() before calling len(corpus)r	   r   s    r   __len__zCorpusABC.__len__b   s    !"VWWWr   NFc                      t          d          )aw  Save `corpus` to disk.

        Some formats support saving the dictionary (`feature_id -> word` mapping),
        which can be provided by the optional `id2word` parameter.

        Notes
        -----
        Some corpora also support random access via document indexing, so that the documents on disk
        can be accessed in O(1) time (see the :class:`gensim.corpora.indexedcorpus.IndexedCorpus` base class).

        In this case, :meth:`~gensim.interfaces.CorpusABC.save_corpus` is automatically called internally by
        :func:`serialize`, which does :meth:`~gensim.interfaces.CorpusABC.save_corpus` plus saves the index
        at the same time.

        Calling :func:`serialize() is preferred to calling :meth:`gensim.interfaces.CorpusABC.save_corpus`.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of list of (int, number)
            Corpus in BoW format.
        id2word : :class:`~gensim.corpora.Dictionary`, optional
            Dictionary of corpus.
        metadata : bool, optional
            Write additional metadata to a separate too?

        r   r	   )fnamecorpusid2wordmetadatas       r   save_corpuszCorpusABC.save_corpusf   s    < ""JKKKr   )NF)
__name__
__module____qualname____doc__r   r   r   staticmethodr   __classcell__)r   s   @r   r   r      s        1 1dL L L5 5 5 5 5$X X X L L L \L L L L Lr   r   c                   ,    e Zd ZdZddZd Zd Zd ZdS )TransformedCorpuszQInterface for corpora that are the result of an online (streamed) transformation.Nc                     |||c| _         | _        | _        |                                D ]\  }}t	          | j         ||           d| _        dS )a  

        Parameters
        ----------
        obj : object
            A transformation :class:`~gensim.interfaces.TransformationABC` object that will be applied
            to each document from `corpus` during iteration.
        corpus : iterable of list of (int, number)
            Corpus in bag-of-words format.
        chunksize : int, optional
            If provided, a slightly more effective processing will be performed by grouping documents from `corpus`.

        FN)objr   	chunksizeitemssetattrr   )r   r)   r   r*   r   keyvalues          r   __init__zTransformedCorpus.__init__   sW     14VY-$+t~ ,,.. 	* 	*JCDHc5))))r   c                 *    t          | j                  S )zGet corpus size.)lenr   r   s    r   r   zTransformedCorpus.__len__   s    4;r   c              #      K   | j         rGt          j        | j        | j                   D ]%}| j                            |d          D ]}|V  &dS | j        D ]}| j        |         V  dS )a#  Iterate over the corpus, applying the selected transformation.

        If `chunksize` was set in the constructor, works in "batch-manner" (more efficient).

        Yields
        ------
        list of (int, number)
            Documents in the sparse Gensim bag-of-words format.

        N)r*   )r*   r   grouperr   r)   __getitem__)r   chunktransformeddocs       r   r   zTransformedCorpus.__iter__   s       > 	$t{DNCC & &#'8#7#7#7#N#N & &K%%%%%&& & { $ $hsm####$ $r   c                     t          | j        d          r| j        | j        |                  S t          d                    t          | j                                      )aY  Transform the document at position `docno` within `corpus` specified in the constructor.

        Parameters
        ----------
        docno : int
            Position of the document to transform. Document offset inside `self.corpus`.

        Notes
        -----
        `self.corpus` must support random indexing.

        Returns
        -------
        list of (int, number)
            Transformed document in the sparse Gensim bag-of-words format.

        Raises
        ------
        RuntimeError
            If corpus doesn't support index slicing (`__getitem__` doesn't exists).

        r4   z!Type {} does not support slicing.)hasattrr   r)   RuntimeErrorformattype)r   docnos     r   r4   zTransformedCorpus.__getitem__   sT    . 4;.. 	^8DK.//BII$t{J[J[\\]]]r   N)r    r!   r"   r#   r/   r   r   r4    r   r   r'   r'      s`        [[   (     $ $ $&^ ^ ^ ^ ^r   r'   c                        e Zd ZdZd ZddZdS )TransformationABCa  Transformation interface.

    A 'transformation' is any object which accepts document in BoW format via the `__getitem__` (notation `[]`)
    and returns another sparse document in its stead:

    .. sourcecode:: pycon

        >>> from gensim.models import LsiModel
        >>> from gensim.test.utils import common_dictionary, common_corpus
        >>>
        >>> model = LsiModel(common_corpus, id2word=common_dictionary)
        >>> bow_vector = model[common_corpus[0]]  # model applied through __getitem__ on one document from corpus.
        >>> bow_corpus = model[common_corpus]  # also, we can apply model on the full corpus

    c                      t          d          )a  Transform a single document, or a whole corpus, from one vector space into another.

        Parameters
        ----------
        vec : {list of (int, number), iterable of list of (int, number)}
            Document in bag-of-words, or streamed corpus.

        r   r	   )r   vecs     r   r4   zTransformationABC.__getitem__        ""JKKKr   Nc                      t          | ||fi |S )a  Apply the transformation to a whole corpus and get the result as another corpus.

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in sparse Gensim bag-of-words format.
        chunksize : int, optional
            If provided, a more effective processing will performed.

        Returns
        -------
        :class:`~gensim.interfaces.TransformedCorpus`
            Transformed corpus.

        )r'   )r   r   r*   r   s       r   _applyzTransformationABC._apply   s      !vyCCFCCCr   r>   )r    r!   r"   r#   r4   rF   r?   r   r   rA   rA      sJ         	L 	L 	LD D D D D Dr   rA   c                   *    e Zd ZdZd Zd Zd Zd ZdS )SimilarityABCa  Interface for similarity search over a corpus.

    In all instances, there is a corpus against which we want to perform the similarity search.
    For each similarity search, the input is a document or a corpus, and the output are the similarities
    to individual corpus documents.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.similarities import MatrixSimilarity
        >>> from gensim.test.utils import common_corpus
        >>>
        >>> index = MatrixSimilarity(common_corpus)
        >>> similarities = index.get_similarities(common_corpus[1])  # get similarities between query and corpus

    Notes
    -----
    There is also a convenience wrapper, where iterating over `self` yields similarities of each document in the corpus
    against the whole corpus (i.e. the query is each corpus document in turn).

    See Also
    --------
    :mod:`gensim.similarities`
        Different index implementations of this interface.

    c                      t          d          )z

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in sparse Gensim bag-of-words format.

        &cannot instantiate Abstract Base Classr	   )r   r   s     r   r/   zSimilarityABC.__init__  rD   r   c                      t          d          )a%  Get similarities of the given document or corpus against this index.

        Parameters
        ----------
        doc : {list of (int, number), iterable of list of (int, number)}
            Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents.

        rJ   r	   )r   r7   s     r   get_similaritieszSimilarityABC.get_similarities&  rD   r   c                     t          j        |          \  }} j        r7t          j        |          s#|rd |D             }nt          j        |          }                     |          } j        |S t           dd          rt          j	        | j                  S t          j        |          r fd|D             S t          j
        | j                  S )a  Get similarities of the given document or corpus against this index.

        Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally.

        Notes
        -----
        Passing an entire corpus as `query` can be more efficient than passing its documents one after another,
        because it will issue queries in batches internally.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents.

        Returns
        -------
        {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
            Similarities given document or corpus and objects corpus, depends on `query`.

        c                 6    g | ]}t          j        |          S r?   )r   unitvec).0vs     r   
<listcomp>z-SimilarityABC.__getitem__.<locals>.<listcomp>N  s#    @@@QX-a00@@@r   Nmaintain_sparsityFc                 D    g | ]}t          j        |j                  S r?   )r   full2sparse_clippednum_best)rP   rQ   r   s     r   rR   z-SimilarityABC.__getitem__.<locals>.<listcomp>^  s(    SSSqH0DMBBSSSr   )r   	is_corpus	normalizer   ismatrixrO   rL   rV   getattrscipy2scipy_clippedrU   )r   queryrW   results   `   r   r4   zSimilarityABC.__getitem__1  s    * !?511	5> 		4
 $U++ 4 4@@%@@@EE$,U33E&&u--= 	M 4,e44 	G/FFF V$$ 	GSSSSFSSSS /FFFr   c              #   ~  K   | j         }d| _         	 | j        dk    }n# t          $ r d}Y nw xY w|rpt          d| j        j        d         | j                  D ]H}t          | j        j        d         || j        z             }| j        ||         }| |         D ]}|V  In| j        D ]}| |         V  || _         dS )a   Iterate over all documents, compute similarity of each document against all other documents in the index.

        Yields
        ------
        {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
            Similarity of the current document and all documents in the corpus.

        F   r   N)rX   r*   AttributeErrorrangeindexshapemin)r   normchunkingchunk_start	chunk_endr5   simr7   s           r   r   zSimilarityABC.__iter__c  s
      ~	~)HH 	 	 	HHH	  	   %Q
(8(;T^LL    
 0 3[4>5QRR	
;	#9:;  CIIII z    3i s    --N)r    r!   r"   r#   r/   rL   r4   r   r?   r   r   rH   rH      sd         6	L 	L 	L	L 	L 	L0G 0G 0Gd+ + + + +r   rH   )r#   logginggensimr   r   	getLoggerr    loggerSaveLoadr   r'   rA   rH   r?   r   r   <module>ro      s3     " " " " " " " " 
	8	$	$kL kL kL kL kL kL kL kL\G^ G^ G^ G^ G^	 G^ G^ G^T+D +D +D +D +D +D +D +D\O O O O OEN O O O O Or   