o
    V`J                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlm	Z	 ddlm
Z
 ddlmZ ddlZddlZdd	lmZ dd
lmZ ejdk rOejZnejZ		dddZ			dddZ				dddZG dd deZdd ZdS )z(Utilities for text input preprocessing.
    )absolute_import)division)print_functionN)OrderedDict)defaultdict)md5)range)zip   !!"#$%&()*+,-./:;<=>?@[\]^_`{|}~	
T c                    s   |r|   } tjdk r@t| tr fdd|D }| |} n3t dkr4t| t| }| |} n|D ]}| | } q6n fdd|D }t|}| |} | 	 }dd |D S )a  Converts a text to a sequence of words (or tokens).

    # Arguments
        text: Input text (string).
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to convert the input to lowercase.
        split: str. Separator for word splitting.

    # Returns
        A list of words (or tokens).
    r
   c                    s   i | ]	}t |t qS  )ordunicode.0csplitr   8lib/python3.10/site-packages/keras_preprocessing/text.py
<dictcomp>/   s    z)text_to_word_sequence.<locals>.<dictcomp>   c                    s   i | ]}| qS r   r   r   r   r   r   r   :   s    c                 S   s   g | ]}|r|qS r   r   )r   ir   r   r   
<listcomp>?       z)text_to_word_sequence.<locals>.<listcomp>)
lowersysversion_info
isinstancer   	translatelen	maketransreplacer   )textfiltersr   r   Ztranslate_mapr   Ztranslate_dictseqr   r   r   text_to_word_sequence   s&   




r'   c                 C   s   t | |t|||dS )a  One-hot encodes a text into a list of word indexes of size n.

    This is a wrapper to the `hashing_trick` function using `hash` as the
    hashing function; unicity of word to index mapping non-guaranteed.

    # Arguments
        text: Input text (string).
        n: int. Size of vocabulary.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.

    # Returns
        List of integers in [1, n]. Each integer encodes a word
        (unicity non-guaranteed).
    )hash_functionr%   r   r   )hashing_trickhash)r$   nr%   r   r   r   r   r   one_hotB   s   r,   c                    sB    du rt  n dkrdd  t| |||d} fdd|D S )a  Converts a text to a sequence of indexes in a fixed-size hashing space.

    # Arguments
        text: Input text (string).
        n: Dimension of the hashing space.
        hash_function: defaults to python `hash` function, can be 'md5' or
            any function that takes in input a string and returns a int.
            Note that 'hash' is not a stable hashing function, so
            it is not consistent across different runs, while 'md5'
            is a stable hashing function.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.

    # Returns
        A list of integer word indices (unicity non-guaranteed).

    `0` is a reserved index that won't be assigned to any word.

    Two or more words may be assigned to the same index, due to possible
    collisions by the hashing function.
    The [probability](
        https://en.wikipedia.org/wiki/Birthday_problem#Probability_table)
    of a collision is in relation to the dimension of the hashing space and
    the number of distinct objects.
    Nr   c                 S   s   t t|   dS )N   )intr   encodeZ	hexdigest)wr   r   r   r(      s   z$hashing_trick.<locals>.hash_function)r%   r   r   c                    s    g | ]} |d   d  qS )r   r   )r   r0   r(   r+   r   r   r      s     z!hashing_trick.<locals>.<listcomp>)r*   r'   )r$   r+   r(   r%   r   r   r&   r   r1   r   r)   _   s   !r)   c                   @   s|   e Zd ZdZ							ddd	Zd
d Zdd Zdd Zdd Zdd Z	dd Z
d ddZd ddZdd Zdd ZdS )!	TokenizeraO  Text tokenization utility class.

    This class allows to vectorize a text corpus, by turning each
    text into either a sequence of integers (each integer being the index
    of a token in a dictionary) or into a vector where the coefficient
    for each token could be binary, based on word count, based on tf-idf...

    # Arguments
        num_words: the maximum number of words to keep, based
            on word frequency. Only the most common `num_words-1` words will
            be kept.
        filters: a string where each element is a character that will be
            filtered from the texts. The default is all punctuation, plus
            tabs and line breaks, minus the `'` character.
        lower: boolean. Whether to convert the texts to lowercase.
        split: str. Separator for word splitting.
        char_level: if True, every character will be treated as a token.
        oov_token: if given, it will be added to word_index and used to
            replace out-of-vocabulary words during text_to_sequence calls

    By default, all punctuation is removed, turning the texts into
    space-separated sequences of words
    (words maybe include the `'` character). These sequences are then
    split into lists of tokens. They will then be indexed or vectorized.

    `0` is a reserved index that won't be assigned to any word.
    Nr   Tr   Fr   c           	      K   s   d|v rt d |d}|rtdt| t | _tt| _	|| _
|| _|| _|| _|| _|| _|| _tt| _i | _i | _d S )NZnb_wordszDThe `nb_words` argument in `Tokenizer` has been renamed `num_words`.z Unrecognized keyword arguments: )warningswarnpop	TypeErrorstrr   word_countsr   r.   	word_docsr%   r   r   	num_wordsdocument_count
char_level	oov_token
index_docs
word_index
index_word)	selfr:   r%   r   r   r<   r=   r;   kwargsr   r   r   __init__   s"   	




zTokenizer.__init__c                 C   sl  |D ][}|  j d7  _ | jst|tr*| jr't|tr#dd |D }n| }|}n
t|| j| j| j}|D ]}|| jv rG| j|  d7  < q6d| j|< q6t	|D ]}| j
|  d7  < qQqt| j }|jdd dd | jdu rvg }n| jg}|d	d
 |D  tt|ttdt|d | _dd | j D | _t| j
 D ]\}}|| j| j| < qdS )a  Updates internal vocabulary based on a list of texts.

        In the case where texts contains lists,
        we assume each entry of the lists to be a token.

        Required before using `texts_to_sequences` or `texts_to_matrix`.

        # Arguments
            texts: can be a list of strings,
                a generator of strings (for memory-efficiency),
                or a list of list of strings.
        r   c                 S      g | ]}|  qS r   r   r   Z	text_elemr   r   r   r      r   z*Tokenizer.fit_on_texts.<locals>.<listcomp>c                 S   s   | d S )Nr   r   )xr   r   r   <lambda>   s    z(Tokenizer.fit_on_texts.<locals>.<lambda>T)keyreverseNc                 s   s    | ]}|d  V  qdS )r   Nr   )r   Zwcr   r   r   	<genexpr>   s    z)Tokenizer.fit_on_texts.<locals>.<genexpr>c                 S   s   i | ]\}}||qS r   r   )r   r0   r   r   r   r   r      s    z*Tokenizer.fit_on_texts.<locals>.<dictcomp>)r;   r<   r   listr   r'   r%   r   r8   setr9   itemssortr=   extenddictr	   r   r!   r?   r@   r>   )rA   textsr$   r&   r0   ZwcountsZ
sorted_vocr   r   r   r   fit_on_texts   sB   


zTokenizer.fit_on_textsc                 C   sD   |  j t|7  _ |D ]}t|}|D ]}| j|  d7  < qqdS )a%  Updates internal vocabulary based on a list of sequences.

        Required before using `sequences_to_matrix`
        (if `fit_on_texts` was never called).

        # Arguments
            sequences: A list of sequence.
                A "sequence" is a list of integer word indices.
        r   N)r;   r!   rM   r>   )rA   	sequencesr&   r   r   r   r   fit_on_sequences   s   
zTokenizer.fit_on_sequencesc                 C      t | |S )aN  Transforms each text in texts to a sequence of integers.

        Only top `num_words-1` most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            texts: A list of texts (strings).

        # Returns
            A list of sequences.
        )rL   texts_to_sequences_generator)rA   rR   r   r   r   texts_to_sequences     zTokenizer.texts_to_sequencesc           	      c   s    | j }| j| j}|D ]_}| jst|tr.| jr+t|tr'dd |D }n| }|}n
t|| j	| j| j
}g }|D ],}| j|}|dur^|rX||krX|durW|| q<|| q<| jdurh|| q<|V  qdS )a  Transforms each text in `texts` to a sequence of integers.

        Each item in texts can also be a list,
        in which case we assume each item of that list to be a token.

        Only top `num_words-1` most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            texts: A list of texts (strings).

        # Yields
            Yields individual sequences.
        c                 S   rD   r   rE   rF   r   r   r   r   0  r   z:Tokenizer.texts_to_sequences_generator.<locals>.<listcomp>N)r:   r?   getr=   r<   r   rL   r   r'   r%   r   append)	rA   rR   r:   oov_token_indexr$   r&   vectr0   r   r   r   r   rW     s:   



z&Tokenizer.texts_to_sequences_generatorc                 C   rV   )aY  Transforms each sequence into a list of text.

        Only top `num_words-1` most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            sequences: A list of sequences (list of integers).

        # Returns
            A list of texts (strings)
        )rL   sequences_to_texts_generator)rA   rT   r   r   r   sequences_to_textsF  rY   zTokenizer.sequences_to_textsc                 c   s    | j }| j| j}|D ]A}g }|D ]2}| j|}|dur8|r2||kr2|dur1|| j|  q|| q| jdurE|| j|  qd|}|V  qdS )a  Transforms each sequence in `sequences` to a list of texts(strings).

        Each sequence has to a list of integers.
        In other words, sequences should be a list of sequences

        Only top `num_words-1` most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            sequences: A list of sequences.

        # Yields
            Yields individual texts.
        Nr   )r:   r?   rZ   r=   r@   r[   join)rA   rT   r:   r\   r&   r]   ZnumZwordr   r   r   r^   T  s&   

z&Tokenizer.sequences_to_texts_generatorbinaryc                 C   s   |  |}| j||dS )zConvert a list of texts to a Numpy matrix.

        # Arguments
            texts: list of strings.
            mode: one of "binary", "count", "tfidf", "freq".

        # Returns
            A Numpy matrix.
        )mode)rX   sequences_to_matrix)rA   rR   rb   rT   r   r   r   texts_to_matrixt  s   

zTokenizer.texts_to_matrixc                 C   sL  | j s| jrt| jd }ntd| j }|dkr | js tdtt||f}t|D ]v\}}|s4q-tt	}|D ]}||krAq:||  d7  < q:t
| D ]R\}}	|dkr_|	|| |< qP|dkrn|	t| || |< qP|dkryd|| |< qP|dkrdt|	 }
td| jd| j|d   }|
| || |< qPtd	|q-|S )
a  Converts a list of sequences into a Numpy matrix.

        # Arguments
            sequences: list of sequences
                (a sequence is a list of integer word indices).
            mode: one of "binary", "count", "tfidf", "freq"

        # Returns
            A Numpy matrix.

        # Raises
            ValueError: In case of invalid `mode` argument,
                or if the Tokenizer requires to be fit to sample data.
        r   zKSpecify a dimension (`num_words` argument), or fit on some text data first.Ztfidfz7Fit the Tokenizer on some data before using tfidf mode.countZfreqra   r   zUnknown vectorization mode:)r:   r?   r!   
ValueErrorr;   npZzeros	enumerater   r.   rL   rN   logr>   rZ   )rA   rT   rb   r:   rG   r   r&   Zcountsjr   ZtfZidfr   r   r   rc     s@   

zTokenizer.sequences_to_matrixc                 C   sh   t | j}t | j}t | j}t | j}t | j}| j| j| j	| j
| j| j| j|||||dS )a:  Returns the tokenizer configuration as Python dictionary.
        The word count dictionaries used by the tokenizer get serialized
        into plain JSON, so that the configuration can be read by other
        projects.

        # Returns
            A Python dictionary with the tokenizer configuration.
        )r:   r%   r   r   r<   r=   r;   r8   r9   r>   r@   r?   )jsondumpsr8   r9   r>   r?   r@   r:   r%   r   r   r<   r=   r;   )rA   Zjson_word_countsZjson_word_docsZjson_index_docsZjson_word_indexZjson_index_wordr   r   r   
get_config  s$   	zTokenizer.get_configc                 K   s(   |   }| jj|d}tj|fi |S )a  Returns a JSON string containing the tokenizer configuration.
        To load a tokenizer from a JSON string, use
        `keras.preprocessing.text.tokenizer_from_json(json_string)`.

        # Arguments
            **kwargs: Additional keyword arguments
                to be passed to `json.dumps()`.

        # Returns
            A JSON string containing the tokenizer configuration.
        )
class_nameconfig)rm   	__class____name__rk   rl   )rA   rB   ro   tokenizer_configr   r   r   to_json  s
   zTokenizer.to_json)Nr   Tr   FNr   )ra   )rq   
__module____qualname____doc__rC   rS   rU   rX   rW   r_   r^   rd   rc   rm   rs   r   r   r   r   r2      s(    
6+
 
7r2   c           	      C   s   t | }|d}t |d}t |d}t |d}dd | D }t |d}dd | D }t |d	}tdi |}||_||_||_||_	||_
|S )zParses a JSON tokenizer configuration file and returns a
    tokenizer instance.

    # Arguments
        json_string: JSON string encoding a tokenizer configuration.

    # Returns
        A Keras Tokenizer instance
    ro   r8   r9   r>   c                 S      i | ]	\}}t ||qS r   r.   r   kvr   r   r   r         z'tokenizer_from_json.<locals>.<dictcomp>r@   c                 S   rw   r   rx   ry   r   r   r   r     r|   r?   Nr   )rk   loadsrZ   r5   rN   r2   r8   r9   r>   r?   r@   )	Zjson_stringrr   ro   r8   r9   r>   r@   r?   Z	tokenizerr   r   r   tokenizer_from_json  s    


r~   )r   Tr   )Nr   Tr   )rv   Z
__future__r   r   r   stringr   r3   collectionsr   r   Zhashlibr   rk   Znumpyrg   Z	six.movesr   r	   r   r"   r7   r'   r,   r)   objectr2   r~   r   r   r   r   <module>   sB   

)

.  _