
    G@d1|                         d Z ddlmZ ddlZddlZd Zd Z G d d          Z G d d	e          Z	 G d
 d          Z
dS )a  
Main module for computing DAFSA/DAWG graphs from list of strings.

The library computes a Deterministic Acyclic Finite State Automata from a
list of sequences in a non incremental way, with no plans to expand to
incremental computation. The library was originally based on public domain
code by `Steve Hanov (2011) <http://stevehanov.ca/blog/?id=115>`__.

Adapted from dafsa/dafsa.py of
`DAFSA <https://github.com/tresoldi/dafsa>`_.
    )CounterNc                     d}t          t          t          |           t          |                              D ]}| |         ||         k    r n|dz  }|S )a  
    Return the length of the common prefix between two sequences.
    Parameters
    ----------
    seq_a : iter
        An iterable holding the first sequence.
    seq_b : iter
        An iterable holding the second sequence.
    Returns
    -------
    length: int
        The length of the common prefix between `seq_a` and `seq_b`.
    Examples
    --------
    >>> import dafsa
    >>> dafsa.utils.common_prefix_length("abcde", "abcDE")
    3
    >>> dafsa.utils.common_prefix_length("abcde", "ABCDE")
    0
    r      )rangeminlen)seq_aseq_bcommon_prefix_lenis       Alib/python3.11/site-packages/spyder/utils/external/dafsa/dafsa.pycommon_prefix_lengthr      sc    , 3s5zz3u::..//  8uQxEQ    c                 p    t          j        |           \  }}t          |d           t          ||          S )af  
    Iterate pairwise over an iterable.
    The function follows the recipe offered on Python's `itertools`
    documentation.
    Parameters
    ----------
    iterable : iter
        The iterable to be iterate pairwise.
    Examples
    --------
    >>> import dafsa
    >>> list(dafsa.utils.pairwise([1,2,3,4,5]))
    [(1, 2), (2, 3), (3, 4), (4, 5)]
    N)	itertoolsteenextzip)iterableelem_aelem_bs      r   pairwiser   <   s7      ]8,,NFFvvr   c                   <    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
d	S )
	DAFSANodea  
    Class representing node objects in a DAFSA.

    Each object carries an internal ``node_id`` integer identifier which must
    be locally unique within a DAFSA, but is meaningless. There is no
    implicit order nor a sequential progression must be observed.

    As in previous implementation by Hanov (2011), minimization is performed
    by comparing nodes, with equivalence determined by the standard
    Python ``.__eq__()`` method which overloads the equality operator. Nodes
    are considered identical if they have identical edges, which edges
    pointing from or to the same node. In particular, edge weight and node
    finalness, respectively expressed by the ``.weight`` and ``.final``
    properties, are *not* considered. This allows to correctly count edges
    after minimization and to have final pass-through nodes.

    Parameters
    ----------
    node_id : int
        The global unique ID for the current node.
    c                 >    i | _         d| _        d| _        || _        dS )z*
        Initializes a DAFSANode.
        Fr   N)edgesfinalweightnode_id)selfr   s     r   __init__zDAFSANode.__init__k   s&     

 r   c                 n     d                      fdt           j                  D                       }|S )a  
        Return a textual representation of the node.

        The representation lists any edge, with ``id`` and ``attr``ibute. The
        edge dictionary is sorted at every call, so that, even if
        more expansive computationally, the function is guaranteed to be
        idempotent in all implementations.

        Please note that, as counts and final state are not accounted for,
        the value returned by this method might be ambiguous, with different
        nodes returning the same value. For unambigous representation,
        the ``.__repr__()`` method must be used.

.. code:: python
        >>> from dafsa import DAFSANode, DAFSAEdge
        >>> node = DAFSANode(0)
        >>> node.final = True
        >>> node.edges["x"] = DAFSAEdge(DAFSANode(1), 1)
        >>> str(node)
        'x|1'

        Returns
        -------
        string : str
            The (potentially ambiguous) textual representation of the
            current node.
        ;c                 H    g | ]}d |j         |         j        j        fz  S )z%s|%ir   noder   .0labelr    s     r   
<listcomp>z%DAFSANode.__str__.<locals>.<listcomp>   s@        5$*U"3"8"@AA  r   )joinsortedr   r    bufs   ` r   __str__zDAFSANode.__str__x   sO    > hh   #DJ//  
 
 
r   c                      d                     d                      fdt           j                  D                       g          } j        dk    rd|z  }n j        rd|z  }nd|z  }|S )a  
        Return an unambigous textual representation of the node.

        The representation lists any edge, with all properties. The
        edge dictionary is sorted at every call, so that, even if
        more expansive computationally, the function is guaranteed to be
        idempotent in all implementations.

        Please note that, as the return value includes information such as
        edge weight, it cannot be used for minimization. For such purposes,
        the potentially ambiguous ``.__str__()`` method must be used.

.. code:: python
        >>> from dafsa import DAFSANode, DAFSAEdge
        >>> node = DAFSANode(0)
        >>> node.final = True
        >>> node.edges["x"] = DAFSAEdge(DAFSANode(1), 1)
        >>> repr(node)
        '0(#1/0:<x>/1)'

        Returns
        -------
        string : str
            The unambiguous textual representation of the current node.
        r#   |c                 v    g | ]5}d j         |         j        j        j        |j         |         j        fz  6S )z#%i/%i:<%s>/%i)r   r&   r   r   r'   s     r   r*   z&DAFSANode.__repr__.<locals>.<listcomp>   sZ     	 	 	 " ) Ju-2: K! Ju-4		 	 	r   r   z0(%s)zF(%s)zn(%s))r+   r,   r   r   r   r-   s   ` r   __repr__zDAFSANode.__repr__   s    : hh	 	 	 	 &,DJ%7%7	 	 	 
 
& <1C-CCZ 	 C-CCC-C
r   c                    t          | j                  t          |j                  k    rdS | j        |j        k    rdS | j        D ]A}||j        vr dS | j        |         j        j        |j        |         j        j        k    r dS BdS )a  
        Checks whether two nodes are equivalent.

        Please note that this method checks for *equivalence* (in particular,
        disregarding edge weight), and not for *equality*.

        Paremeters
        ----------
        other : DAFSANode
            The DAFSANode to be compared with the current one.

        Returns
        -------
        eq : bool
            A boolean indicating if the two nodes are equivalent.
        FT)r   r   r   r&   r   )r    otherr)   s      r   __eq__zDAFSANode.__eq__   s    * tz??c%+....5 :$$5 Z 	 	EEK''uu 
5!&.;u%*23 3 uu3
 tr   c                 V    |                                  |                                 k    S )a@  
        Return a "greater than" comparison between two nodes.

        Internally, the method reuses the ``.__str__()`` method, so that
        the logic for comparison is implemented in a single place. As such,
        while it guarantees idempotency when sorting nodes, it does not
        check for properties suc like "node length", "entropy", or
        "information amount", only providing a convenient complementary
        method to ``.__eq__()``.

        Paremeters
        ----------
        other : DAFSANode
            The DAFSANode to be compared with the current one.

        Returns
        -------
        gt : bool
            A boolean indicating if the current node is greater than the one
            it is compared with (that is, if it should be placed after it
            in an ordered sequence).
        )r/   )r    r5   s     r   __gt__zDAFSANode.__gt__  s    0 ||~~//r   c                 N    |                                                                  S )a@  
        Return a hash for the node.

        The returned has is based on the potentially ambigous string
        representation provided by the ``.__str__()`` method, allowing to
        use nodes as, among others, dictionary keys. The choice of the
        potentially ambiguous ``.__str__()`` over ``.__repr__()`` is intentional
        and by design and complemented by the ``.repr_hash()`` method.

        Returns
        -------
        hash : number
            The hash from the (potentially ambigous) textual representation of
            the current node.
        r/   __hash__r    s    r   r;   zDAFSANode.__hash__      " ||~~&&(((r   c                 N    |                                                                  S )a  
        Return a hash for the node.

        The returned has is based on the unambigous string
        representation provided by the ``.__repr__()`` method, allowing to
        use nodes as, among others, dictionary keys. The method is
        complemented by the ``.__hash__()`` one.

        Returns
        -------
        hash : number
            The hash from the unambigous textual representation of the
            current node.
        r3   r;   r<   s    r   	repr_hashzDAFSANode.repr_hash1        }}'')))r   N)__name__
__module____qualname____doc__r!   r/   r3   r6   r8   r;   r@    r   r   r   r   R   s         0  & & &P7 7 7r) ) )V0 0 04) ) )&* * * * *r   r   c                   <     e Zd ZdZd fd	Zd Zd Zd Zd Z xZ	S )		DAFSAEdgeaS  
    Class representing edge objects in a DAFSA.

    This class overloads a normal Python dictionary, and in simpler
    implementations could potentially be replaced with a pure dictionary.
    It was implemented as its own object for homogeneity and for planned
    future expansions, particularly in terms of fuzzy automata.

    Parameters
    ----------
    node : DAFSANode
        Reference to the target node, mandatory. Please note that it
        must be a DAFSANode object and *not* a node id.
    weight : int
        Edge weight as collected from training data. Defaults to 0.
    r   c                     t                                                       t          |t                    st	          d          || _        || _        dS )z+
        Initializes a DAFSA edge.
        z=`node` must be a DAFSANode (perhaps a `node_id` was passed?).N)superr!   
isinstancer   	TypeErrorr&   r   )r    r&   r   	__class__s      r   r!   zDAFSAEdge.__init__V  sX     	 $	** 	O   	r   c                 .    d| j         j        | j        fz  S )aG  
        Return a textual representation of the node.

        The representation only include the ``node_id``, without information
        on the node actual contents.

        Returns
        -------
        string : str
            The (potentially ambiguous) textual representation of the
            current edge.
        z{node_id: %i, weight: %i})r&   r   r   r<   s    r   r/   zDAFSAEdge.__str__f  s     +di.?-MMMr   c                 >    dt          | j                  | j        fz  S )a  
        Return a full textual representation of the node.

        The representation includes information on the entire contents of
        the node.

        Returns
        -------
        string : str
            The unambiguous textual representation of the current edge.
        z{node: <%s>, weight: %i})reprr&   r   r<   s    r   r3   zDAFSAEdge.__repr__v  s     *T$)__dk,JJJr   c                 N    |                                                                  S )a@  
        Return a hash for the edge.

        The returned has is based on the potentially ambigous string
        representation provided by the ``.__str__()`` method, allowing to
        use edges as, among others, dictionary keys. The choice of the
        potentially ambiguous ``.__str__()`` over ``.__repr__()`` is intentional
        and by design and complemented by the ``.repr_hash()`` method.

        Returns
        -------
        hash : number
            The hash from the (potentially ambigous) textual representation of
            the current edge.
        r:   r<   s    r   r;   zDAFSAEdge.__hash__  r=   r   c                 N    |                                                                  S )a  
        Return a hash for the edge.

        The returned has is based on the unambigous string
        representation provided by the ``.__repr__()`` method, allowing to
        use edges as, among others, dictionary keys. The method is
        complemented by the ``.__hash__()`` one.

        Returns
        -------
        hash : number
            The hash from the unambigous textual representation of the
            current edge.
        r?   r<   s    r   r@   zDAFSAEdge.repr_hash  rA   r   )r   )
rB   rC   rD   rE   r!   r/   r3   r;   r@   __classcell__)rM   s   @r   rH   rH   D  s         "      N N N K K K) ) )&* * * * * * *r   rH   c                   V    e Zd ZdZd Zd Zd Zd Zd Zd Z	dd	Z
d
 Zd Zd Zd ZdS )DAFSAaL  
    Class representing a DAFSA object.

    Parameters
    ----------
    sequences : list
        List of sequences to be added to the DAFSA object.
    weight : bool
        Whether to collect edge weights after minimization. Defaults
        to ``True``.
    condense: bool
        Whether to join sequences of transitions into single compound
        transitions whenever possible. Defaults to ``False``.
    delimiter : str
        The delimiter to use in case of joining single path transitions.
        Defaults to a single white space (`" "`).
    minimize : bool
        Whether to minimize the trie into a DAFSA. Defaults to ``True``; this
        option is implemented for development and testing purposes and
        it is not intended for users (there are specific and better libraries
        and algorithms to build tries).
    c                    |                     dd          | _        |                     dd          }t          j                    | _        dt          t          | j                            i| _        d| _        g | _	        d| _
        t          |          }t          |          | _
        t          dg|z             D ]\  }}|                     |||           |                     d|           |                     dd          r|                     |           t#          j        | j                  | _        |                     d	d
          r|                                  dS dS )z-
        Initializes a DAFSA object.
        	delimiter minimizeTr   N r   condenseF)get
_delimiterr   count_iditerr   r   nodeslookup_nodes_unchecked_nodes_num_sequencesr,   r   r   _insert_single_seq	_minimize_collect_weightscopydeepcopyr[   )r    	sequenceskwargsrY   previous_seqseqs         r   r!   zDAFSA.__init__  sa    !**[#66::j$// !(( 4#5#5667
  !# # 9%%	!)nn "*2$*:!;!; 	A 	AL###Cx@@@@ 	q(### ::h%% 	-!!),,, !M$*55::j%(( 	MMOOOOO	 	r   c                 z   t          ||          }|                     ||           | j        s| j        d         }n| j        d         d         }||d         D ]Z}t	          t          | j                            }t          |          |j        |<   | j        	                    |||d           |}[d|_
        dS )a  
        Internal method for single sequence insertion.

        Parameters
        ----------
        seq: sequence
            The sequence being inserted.
        previous_seq : sequence
            The previous sequence from the sorted list of sequences,
            for common prefix length computation.
        minimize : bool
            Flag indicating whether to perform minimization or not.
        r   childN)parenttokenro   T)r   re   rb   r`   r   r   r_   rH   r   appendr   )r    rl   rk   rY   
prefix_lenr&   rq   ro   s           r   rd   zDAFSA._insert_single_seq
  s    ( *#|<<
z8,,, $ 	6:a=DD(,W5D % 		 		E d4<0011E )% 0 0DJu!((%%@@   DD 


r   c                    	 d}t          t          | j                  |z
            D ]}| j                                        }|d         }|d         }|d         }|s|| j        |j        <   Ed}	| j                                        D ]\  }
}||k    r|
}	 n|	rI|j        |         j        j	        rd| j        |	         _	        | j        |	         |j        |         _        d}|| j        |j        <   |sdS )a  
        Internal method for graph minimization.

        Minimize the graph from the last unchecked item until ``index``.
        Final minimization, with ``index`` equal to zero, will traverse the
        entire data structure.

        The method allows the minimization to be overridden by setting to
        ``False`` the ``minimize`` flag (returning a trie). Due to the logic in
        place for the DAFSA minimization, this ends up executed as a
        non-efficient code, where all comparisons fail, but it is
        necessary to do it this way to clean the list of unchecked nodes.
        This is not an implementation problem: this class is not supposed
        to be used for generating tries (there are more efficient ways of
        doing that), but it worth having the flag in place for experiments.

        Parameters
        ----------
        index : int
            The index until the sequence minimization, right to left.
        minimize : bool
            Flag indicating whether to perform minimization or not.
        TFrp   rq   ro   N)
r   r   rb   popr`   r   itemsr   r&   r   )r    indexrY   graph_changed_unchecked_noderp   rq   ro   	child_idxnode_idxr&   s               r   re   zDAFSA._minimize;  s7   >/	!M3t455=>> ': ': "&!6!:!:!<!<'1&w/&w/   :05DJu}--
 !%I*.**:*:*<*< " "$5==(0I!E ) ! : "<.39 ?:>DJy1737:i3HU+0 )-49
5=11 ! _/	r   c                 :    	 |                                  dk    rdS )a  
        Condenses the automaton, merging single-child nodes with their parents.

        The function joins paths of unique edges into single edges with
        compound transitions, removing redundant nodes. A redundant node
        is defined as one that (a) is not final, (b) emits a single transition,
        (b) receives a single transition, and (d) its source emits a single
        transition.

        Internally, the function will call the ``._joining_round()``
        method until no more candidates for joining are available.
        Performing everything in a single step would require a more complex
        logic.
        Tr   N)_joining_roundr<   s    r   r[   zDAFSA.condense  s(    "	""$$))	r   c                    	
 g } j                                         D ]\  
|
fd
j        D             z  }t          d |D                       }t          d |D                       }g }g  j                                         D ]\  
|         dk    r|         dk    r
j        r'fd|D             d         		 fd j         	d                  j        D             d         }t          
j                                                  d         }t          fd		D                       r	z  |                    	||d
           |D ]} j	        
                    |d         |d         g          }t           j         |d         d                  j        |d                  j         j         |d         d                  j        |d                  j                   j         |d         d                  j        |<    j         |d         d                  j                            |d                     j                             |d         d                    t          |          S )a  
        Internal method for the unique-edge joining algorithm.

        This function will be called a successive number of times by
        ``._join_transitions()``, until no more candidates for unique-edge
        joining are available (as informed by its return value).

        Returns
        -------
        num_operations: int
            The number of joining operations that was performed. When zero,
            it signals that no more joining is possible.
        c                 D    g | ]}j         |         j        j        d S )sourcetargetr%   )r(   r)   r&   	source_ids     r   r*   z(DAFSA._joining_round.<locals>.<listcomp>  s>        %
50A0F0NOO  r   c                     g | ]
}|d          S )r   rF   r(   edges     r   r*   z(DAFSA._joining_round.<locals>.<listcomp>      <<<d4><<<r   c                     g | ]
}|d          S r   rF   r   s     r   r*   z(DAFSA._joining_round.<locals>.<listcomp>  r   r   r   c                 ,    g | ]}|d          k    |S r   rF   )r(   r   r   s     r   r*   z(DAFSA._joining_round.<locals>.<listcomp>  s'    MMM$4>W3L3L3L3L3Lr   r   c                 x    g | ]6}j         d                   j        |         j        j        d         k    4|7S r   )r`   r   r&   r   )r(   r)   	edge_infor    s     r   r*   z(DAFSA._joining_round.<locals>.<listcomp>  sX       :i128?DLX&' ' ' ' 'r   r   c                     g | ]}|vS rF   rF   )r(   r   transitions_nodess     r   r*   z(DAFSA._joining_round.<locals>.<listcomp>  s    NNNG#44NNNr   )r   
label_fromlabel_tor   r   r   r   )r`   rv   r   r   r   listkeysallrr   r]   r+   rH   r&   r   ru   r   )r    r   sourcestargetstransitionsr   r   
transition	new_labelr   r&   r   r   r   s   `        @@@@@r   r~   zDAFSA._joining_round  s   & #z//11 	 	OIt     !Z   EE <<e<<<==<<e<<<== !Z--// 	 	MGTw!##w!##z  NMMM%MMMaPI    !Z	((;<B  
 J DJOO--..q1H NNNNINNNOO !Y.!"" )&0$,    & 	9 	9J,,L):j+AB I 
:f-h78z*-/
:f-h78z*-/  Jz&)(34: Jz&)(34:>><(   JNN:f-h78888 ;r   c                     |D ]a}| j         d         }|xj        dz  c_        |D ]?}|j        |         xj        dz  c_        |j        |         j        }|xj        dz  c_        @bdS )a-  
        Internal method for collecting node and edge weights from sequences.

        This method requires the minimized graph to be already in place.

        Parameters
        ----------
        sequences : list
            List of sequences whose node and edge weights will be collected.
        r   r   N)r`   r   r   r&   )r    ri   rl   r&   rq   s        r   rf   zDAFSA._collect_weights  s      		! 		!C:a=DKK1KK  ! !
5!((A-((z%(-q !		! 		!r   Fc                     | j         d         }d}|D ]@}||j        vr dS ||j        |         j        z  }|j        |         j        }|r	|j        r nA|j        sdS ||fS )a  
        Check if a sequence can be expressed by the DAFSA.

        The method does not return all possible potential paths, nor
        the cumulative weight: if this is needed, the DAFSA object should
        be converted to a Graph and other libraries, such as ``networkx``,
        should be used.

        Parameters
        ----------
        sequence : sequence
            Sequence to be checked for presence/absence.

        Returns
        -------
        node : tuple of DAFSANode and int, or None
            Either a tuple with a DAFSANode referring to the final state
            that can be reached by following the specified sequence,
            plus the cumulative weight for reaching it, or None if no path
            can be found.
        r   N)ra   r   r   r&   r   )r    sequencestop_on_prefixr&   
cum_weightrq   s         r   lookupzDAFSA.lookup  s    0  # 
 	 	EDJ&&tt$*U+22J:e$)D $*  z 	4Zr   c                 *    t          | j                  S )z
        Return the number of minimized nodes in the structure.

        Returns
        -------
        node_count : int
            Number of minimized nodes in the structure.
        )r   r`   r<   s    r   count_nodeszDAFSA.count_nodes;  s     4:r   c                 b    t          d | j                                        D                       S )z
        Return the number of minimized edges in the structure.

        Returns
        -------
        edge_count : int
            Number of minimized edges in the structure.
        c                 6    g | ]}t          |j                  S rF   )r   r   )r(   r&   s     r   r*   z%DAFSA.count_edges.<locals>.<listcomp>Q  s     DDDC
OODDDr   )sumr`   valuesr<   s    r   count_edgeszDAFSA.count_edgesG  s/     DD
0A0A0C0CDDDEEEr   c                     | j         S )a  
        Return the number of sequences inserted in the structure.

        Please note that the return value mirrors the number of sequences
        provided during initialization, and *not* a set of it: repeated
        sequences are accounted, as each will be added a single time to
        the object.

        Returns
        -------
        seq_count : int
            Number of sequences in the structure.
        )rc   r<   s    r   count_sequenceszDAFSA.count_sequencesS  s     ""r   c                 f   d|                                  |                                 |                                 fz  g}t          | j                  D ]I}| j        |         }|d|t          |          d |j                                        D             fz  gz  }Jd                    |          S )z
        Return a readable multiline textual representation of the object.

        Returns
        -------
        string : str
            The textual representation of the object.
        z8DAFSA with %i nodes and %i edges (%i inserted sequences)z  +-- #%i: %s %sc                 0    g | ]\  }}||j         j        fS rF   )r&   r   )r(   attrns      r   r*   z!DAFSA.__str__.<locals>.<listcomp>~  s%    NNNadAFN+NNNr   
)	r   r   r   r,   r`   rP   r   rv   r+   )r    r.   r   r&   s       r   r/   zDAFSA.__str__d  s     G!!4#3#3#5#5t7K7K7M7MNO
 dj)) 		 		G:g&D"JJNN4:;K;K;M;MNNN CC yy~~r   N)F)rB   rC   rD   rE   r!   rd   re   r[   r~   rf   r   r   r   r   r/   rF   r   r   rU   rU     s         .E E EN/ / /bN N N`  *W  W  W r! ! !.)  )  )  ) V
 
 

F 
F 
F# # #"    r   rU   )rE   collectionsr   rg   r   r   r   r   dictrH   rU   rF   r   r   <module>r      s   
 
              >  ,o* o* o* o* o* o* o* o*dd* d* d* d* d* d* d* d*NX X X X X X X X X Xr   