
    c1                         d Z ddlZddlZddlZddlmZ ddlm	Z	m
Z
  ej        e          ZddZ	 	 ddZ G d	 d
          Zd Zd Zd Zd Zd ZdS )uQ  This module contains functions to compute confirmation on a pair of words or word subsets.

Notes
-----
The advantage of indirect confirmation measure is that it computes similarity of words in :math:`W'` and
:math:`W^{*}` with respect to direct confirmations to all words. Eg. Suppose `x` and `z` are both competing
brands of cars, which semantically support each other. However, both brands are seldom mentioned
together in documents in the reference corpus. But their confirmations to other words like “road”
or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure.
Thus, indirect confirmation measures may capture semantic support that direct measures would miss.

The formula used to compute indirect confirmation measure is

.. math::

    \widetilde{m}_{sim(m, \gamma)}(W', W^{*}) = s_{sim}(\vec{v}^{\,}_{m,\gamma}(W'), \vec{v}^{\,}_{m,\gamma}(W^{*}))


where :math:`s_{sim}` can be cosine, dice or jaccard similarity and

.. math::

    \vec{v}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}

    N)aggregate_segment_simslog_ratio_measureFc                    g }d}t          |           D ]\  }}g }d}	|D ]j\  }
}t          |
d          s|
g}
t          |d          s|g}	 |                    |                    |
|                     V# t          $ r |	dz  }	Y gw xY w|	dk    r!|dz  }t
                              d|	|           |                    t          |||                     |dk    rt
                              d|           |S )u  For each topic segmentation, compute average cosine similarity using a
    :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator`.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from the :func:`~gensim.topic_coherence.segmentation.s_one_set`.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or
                  :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets
        in addition to the mean coherence for each topic.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as
        the number of pairwise similarity comparisons were used to compute the overall topic coherence.

    Returns
    -------
    list of (float[, float[, int]])
        Сosine word2vec similarities per topic (with std/support if `with_std`, `with_support`).

    Examples
    --------
    .. sourcecode:: pycon

        >>> import numpy as np
        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure
        >>> from gensim.topic_coherence import text_analysis
        >>>
        >>> # create segmentation
        >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        >>>
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
        >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
        >>>
        >>> # should be (0.726752426218 0.00695475919227)
        >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0]

    r   __iter__   z:%d terms for topic %d are not in word2vec model vocabularyz1%d terms for are not in word2vec model vocabulary)	enumeratehasattrappendids_similarityZeroDivisionErrorloggerwarningr   )segmented_topicsaccumulatorwith_stdwith_supporttopic_coherences	total_oovtopic_indextopic_segmentssegment_simsnum_oovw_primew_stars               Tlib/python3.11/site-packages/gensim/topic_coherence/indirect_confirmation_measure.pyword2vec_similarityr   ,   sU   Z I'01A'B'B ^ ^#^- 		 		OGV7J// $")6:.. " ##K$>$>w$O$OPPPP$   1 Q; 	&NINNL& & & 	 6|X| \ \]]]]1} WJIVVVs   )A55BBnlrr   c                    t          ||||          }g }t          ||           D ]\  }	}
t          |	          }	t          j        t          |
                    }t          |
          D ]/\  }\  }}|||	f         }|||	f         }t          ||          ||<   0|                    t          |||                     |S )a  Calculate the indirect cosine measure.

    Parameters
    ----------
    segmented_topics: list of lists of (int, `numpy.ndarray`)
        Output from the segmentation module of the segmented topics.
    accumulator: :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Output from the probability_estimation module. Is an topics: Topics obtained from the trained topic model.
    measure : str, optional
        Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
    gamma: float, optional
        Gamma value for computing :math:`W'` and :math:`W^{*}` vectors.
    with_std : bool
        True to also include standard deviation across topic segment sets in addition to the mean coherence
        for each topic; default is False.
    with_support : bool
        True to also include support across topic segments. The support is defined as the number of pairwise similarity
        comparisons were used to compute the overall topic coherence.

    Returns
    -------
    list
        List of indirect cosine similarity measure for each topic.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
        >>> import numpy as np
        >>>
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
        >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
        >>> accumulator._num_docs = 5
        >>>
        >>> # create topics
        >>> topics = [np.array([1, 2])]
        >>>
        >>> # create segmentation
        >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, 'nlr', 1)
        >>> print(obtained[0])
        0.623018926945

    )
ContextVectorComputerziptuplenpzeroslenr   _cossimr
   r   )r   r   topicsmeasuregammar   r   context_vectorsr   topic_wordsr   r   ir   r   
w_prime_cv	w_star_cvs                    r   cosine_similarityr.   v   s    f ,GV[%PPO'*63C'D'D ^ ^#^K((xN 3 344$-n$=$= 	= 	= A (+)=>J'(;<I%j)<<LOO 6|X| \ \]]]]    c                   *    e Zd ZdZd Zd Zd Zd ZdS )r   a0  Lazily compute context vectors for topic segments.

    Parameters
    ----------
    measure: str
        Confirmation measure.
    topics: list of numpy.array
        Topics.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or
                  :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator from probability_estimation.
    gamma: float
        Value for computing vectors.

    Attributes
    ----------
    sim_cache: dict
        Cache similarities between tokens (pairs of word ids), e.g. (1, 2).
    context_vector_cache: dict
        Mapping from (segment, topic_words) --> context_vector.

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
        >>> import numpy as np
        >>>
        >>> # create measure, topics
        >>> measure = 'nlr'
        >>> topics = [np.array([1, 2])]
        >>>
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
        >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
        >>> cont_vect_comp = indirect_confirmation_measure.ContextVectorComputer(measure, topics, accumulator, 1)
        >>> cont_vect_comp.mapping
        {1: 0, 2: 1}
        >>> cont_vect_comp.vocab_size
        2

    c                     |dk    rt           | _        nt          d          t          |          | _        t          | j                  | _        || _        || _        i | _	        i | _
        d S )Nr   zGThe direct confirmation measure you entered is not currently supported.)
_pair_npmi
similarity
ValueError_map_to_contiguousmappingr$   
vocab_sizer   r(   	sim_cachecontext_vector_cache)selfr'   r&   r   r(   s        r   __init__zContextVectorComputer.__init__   sw    e 	[(DOOY[ [ [ *&11dl++&
$&!!!r/   c                      | j         | S N)compute_context_vector)r:   idxs     r   __getitem__z!ContextVectorComputer.__getitem__   s    *t*C00r/   c                     t          ||          }| j                            |d          }| |                     ||          }|| j        |<   |S )a  Check if (segment_word_ids, topic_word_ids) context vector has been cached.

        Parameters
        ----------
        segment_word_ids: list
            Ids of words in segment.
        topic_word_ids: list
            Ids of words in topic.
        Returns
        -------
        csr_matrix :class:`~scipy.sparse.csr`
            If context vector has been cached, then return corresponding context vector,
            else compute, cache, and return.

        N)_key_for_segmentr9   get	_make_seg)r:   segment_word_idstopic_word_idskeycontext_vectors        r   r>   z,ContextVectorComputer.compute_context_vector   s[      /@@266sDAA 	<!^^,<nMMN-;D%c*r/   c                 p   t          j        | j        df          }t          |d          s|f}|D ]q| j                 df}fd|D             D ]Q}|| j        vr#|                     || j                  | j        |<   ||xx         | j        |         | j        z  z  cc<   Rr|	                                S )a  Return context vectors for segmentation (Internal helper function).

        Parameters
        ----------
        segment_word_ids : iterable or int
            Ids of words in segment.
        topic_word_ids : list
            Ids of words in topic.
        Returns
        -------
        csr_matrix :class:`~scipy.sparse.csr`
            Matrix in Compressed Sparse Row format

        r   r   r   c              3   T   K   | ]"}t          t          |f                    V  #d S r=   )r!   sorted).0w_iw_js     r   	<genexpr>z2ContextVectorComputer._make_seg.<locals>.<genexpr>&  s7      OOsvsCj1122OOOOOOr/   )
sps
lil_matrixr7   r	   r6   r8   r3   r   r(   tocsr)r:   rE   rF   rH   r?   pairrN   s         @r   rD   zContextVectorComputer._make_seg  s     !(<=='44 	3 02! 	J 	JC<$a(COOOO>NOOO J Jt~- S+/??4AQ+R+RDN4(s###t~d';tz'II####	J ##%%%r/   N)__name__
__module____qualname____doc__r;   r@   r>   rD    r/   r   r   r      s[        , ,\' ' '1 1 1  .& & & & &r/   r   c                 4    t          | gg|d          d         S )a  Compute normalized pairwise mutual information (**NPMI**) between a pair of words.

    Parameters
    ----------
    pair : (int, int)
        The pair of words (word_id1, word_id2).
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator from probability_estimation.

    Return
    ------
    float
        NPMI between a pair of words.

    Tr   )r   )rS   r   s     r   r2   r2   /  s      tfX{D99!<<r/   c                     | j                             |          d         t          |           t          |          z  z  S )N)r   r   )Tdot
_magnitude)cv1cv2s     r   r%   r%   B  s/    599S>>$:c??Z__#DEEr/   c                 ^    t          j        t          j        | j        dz                      S )N   )r"   sqrtsumdata)
sparse_vecs    r   r]   r]   F  s#    726*/Q.//000r/   c                 n    i }d}t           j                            |           D ]}||vr
|||<   |dz  }|S )Nr   r   )	itertoolschainfrom_iterable)ids_iterableuniq_idsnid_s       r   r5   r5   J  sP    H	A,,\::  h 	HSMFAOr/   c                 L    t          | d          rt          |           n| }||fS )z:A segment may have a single number of an iterable of them.r   )r	   r!   )segmentr*   segment_keys      r   rB   rB   T  s+    $+GZ$@$@M%...gK##r/   )FF)r   r   FF)rW   rg   loggingnumpyr"   scipy.sparsesparserP   2gensim.topic_coherence.direct_confirmation_measurer   r   	getLoggerrT   r   r   r.   r   r2   r%   r]   r5   rB   rX   r/   r   <module>rw      s3   4                h h h h h h h h		8	$	$G G G GT FK<A@ @ @ @Fs& s& s& s& s& s& s& s&l= = =&F F F1 1 1  $ $ $ $ $r/   