
    c                      T    d Z ddlZddlZ ej        e          ZdZddZd Z	d	dZ
dS )
zaThis module contains functions to compute direct confirmation on a pair of words or word subsets.    Ng-q=Fc                 r   g }t          |j                  }| D ]}g }|D ]r\  }}		 ||	         }
|||	f         }t          j        ||z  t          z   |
|z  z            }n# t
          $ r d}Y nt          $ r d}Y nw xY w|                    |           s|                    t          |||                     |S )a  Calculate the log-conditional-probability measure which is used by coherence measures such as `U_mass`.
    This is defined as :math:`m_{lc}(S_i) = log \frac{P(W', W^{*}) + \epsilon}{P(W^{*})}`.

    Parameters
    ----------
    segmented_topics : list of lists of (int, int)
        Output from the :func:`~gensim.topic_coherence.segmentation.s_one_pre`,
        :func:`~gensim.topic_coherence.segmentation.s_one_one`.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator from :mod:`gensim.topic_coherence.probability_estimation`.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets in addition to the mean coherence
        for each topic.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as the number of pairwise
        similarity comparisons were used to compute the overall topic coherence.

    Returns
    -------
    list of float
        Log conditional probabilities measurement for each topic.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis
        >>> from collections import namedtuple
        >>>
        >>> # Create dictionary
        >>> id2token = {1: 'test', 2: 'doc'}
        >>> token2id = {v: k for k, v in id2token.items()}
        >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token)
        >>>
        >>> # Initialize segmented topics and accumulator
        >>> segmentation = [[(1, 2)]]
        >>>
        >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
        >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
        >>> accumulator._num_docs = 5
        >>>
        >>> # result should be ~ ln(1 / 2) = -0.693147181
        >>> result = direct_confirmation_measure.log_conditional_probability(segmentation, accumulator)[0]

    g        )	floatnum_docsnplogEPSILONKeyErrorZeroDivisionErrorappendaggregate_segment_sims)segmented_topicsaccumulatorwith_stdwith_supporttopic_coherencesr   s_isegment_simsw_primew_starw_star_countco_occur_countm_lc_is                Rlib/python3.11/site-packages/gensim/topic_coherence/direct_confirmation_measure.pylog_conditional_probabilityr      s	   \ [)**H ^ ^" 	( 	(OGV*62!,Wf_!=.8";w!F<ZbKb cdd   $    '''' 6|X| \ \]]]]s   7AA9+A98A9c                    t          j        |           }|g}|r'|                    t          j        |                      |r"|                    t	          |                      t	          |          dk    r|d         nt          |          S )a  Compute various statistics from the segment similarities generated via set pairwise comparisons
    of top-N word lists for a single topic.

    Parameters
    ----------
    segment_sims : iterable of float
        Similarity values to aggregate.
    with_std : bool
        Set to True to include standard deviation.
    with_support : bool
        Set to True to include number of elements in `segment_sims` as a statistic in the results returned.

    Returns
    -------
    (float[, float[, int]])
        Tuple with (mean[, std[, support]]).

    Examples
    ---------
    .. sourcecode:: pycon

        >>> from gensim.topic_coherence import direct_confirmation_measure
        >>>
        >>> segment_sims = [0.2, 0.5, 1., 0.05]
        >>> direct_confirmation_measure.aggregate_segment_sims(segment_sims, True, True)
        (0.4375, 0.36293077852394939, 4)
        >>> direct_confirmation_measure.aggregate_segment_sims(segment_sims, False, False)
        0.4375

       r   )r   meanr   stdlentuple)r   r   r   r   statss        r   r   r   W   s    > 7<  DFE +RVL))*** (S&&'''5zzQ8588E%LL8    c                    g }t          |j                  }| D ]}g }|D ]\  }	}
||	         }||
         }||	|
f         }|r@t          |	|
fgg|          d         }||z  }|t          j        |t
          z              z  }n/||z  t
          z   }||z  ||z  z  }t          j        ||z            }|                    |           |                    t          |||                     |S )a	  Compute log ratio measure for `segment_topics`.

    Parameters
    ----------
    segmented_topics : list of lists of (int, int)
        Output from the :func:`~gensim.topic_coherence.segmentation.s_one_pre`,
        :func:`~gensim.topic_coherence.segmentation.s_one_one`.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator from :mod:`gensim.topic_coherence.probability_estimation`.
    normalize : bool, optional
        Details in the "Notes" section.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets in addition to the mean coherence
        for each topic.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as the number of pairwise
        similarity comparisons were used to compute the overall topic coherence.

    Notes
    -----
    If `normalize=False`:
        Calculate the log-ratio-measure, popularly known as **PMI** which is used by coherence measures such as `c_v`.
        This is defined as :math:`m_{lr}(S_i) = log \frac{P(W', W^{*}) + \epsilon}{P(W') * P(W^{*})}`

    If `normalize=True`:
        Calculate the normalized-log-ratio-measure, popularly knowns as **NPMI**
        which is used by coherence measures such as `c_v`.
        This is defined as :math:`m_{nlr}(S_i) = \frac{m_{lr}(S_i)}{-log(P(W', W^{*}) + \epsilon)}`

    Returns
    -------
    list of float
        Log ratio measurements for each topic.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis
        >>> from collections import namedtuple
        >>>
        >>> # Create dictionary
        >>> id2token = {1: 'test', 2: 'doc'}
        >>> token2id = {v: k for k, v in id2token.items()}
        >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token)
        >>>
        >>> # Initialize segmented topics and accumulator
        >>> segmentation = [[(1, 2)]]
        >>>
        >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
        >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
        >>> accumulator._num_docs = 5
        >>>
        >>> # result should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557
        >>> result = direct_confirmation_measure.log_ratio_measure(segmentation, accumulator)[0]

    r   )r   r   log_ratio_measurer   r   r   r   r   )r   r   	normalizer   r   r   r   r   r   r   r   w_prime_countr   r   	numeratorco_doc_probm_lr_idenominators                     r   r$   r$      s0   t [)**H ^ ^" 	( 	(OGV'0M&v.L(&9N 	9-&0A/B.C[QQRST	,x7"rvkG.C'D'D&DE ,h6'A	,x7L8<ST	K 788'''' 6|X| \ \]]]]r"   )FF)FFF)__doc__loggingnumpyr   	getLogger__name__loggerr   r   r   r$    r"   r   <module>r2      s    h g     		8	$	$ A A A AH&9 &9 &9RR R R R R Rr"   