
    Ofc                     >    d dl Z d dlZddlmZ  G d de          ZdS )    N   )ProbingStatec                       e Zd ZdZddZd Zed             Zd Zed             Z	d Z
ed	             Zed
             Zed             ZdS )CharSetProbergffffff?Nc                 ^    d | _         || _        t          j        t                    | _        d S N)_statelang_filterlogging	getLogger__name__logger)selfr
   s     5lib/python3.11/site-packages/chardet/charsetprober.py__init__zCharSetProber.__init__'   s'    &'11    c                 (    t           j        | _        d S r   )r   	DETECTINGr	   r   s    r   resetzCharSetProber.reset,   s    ",r   c                     d S r    r   s    r   charset_namezCharSetProber.charset_name/   s    tr   c                     d S r   r   )r   bufs     r   feedzCharSetProber.feed3   s    r   c                     | j         S r   )r	   r   s    r   statezCharSetProber.state6   s
    {r   c                     dS )Ng        r   r   s    r   get_confidencezCharSetProber.get_confidence:   s    sr   c                 2    t          j        dd|           } | S )Ns   ([ -])+    )resub)r   s    r   filter_high_byte_onlyz#CharSetProber.filter_high_byte_only=   s    f&c22
r   c                    t                      }t          j        d|           }|D ]Z}|                    |dd                    |dd         }|                                s|dk     rd}|                    |           [|S )u9  
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [-ÿ]
        marker: everything else [^a-zA-Z-ÿ]

        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.

        This filter applies to all scripts which do not use English characters.
        s%   [a-zA-Z]*[-]+[a-zA-Z]*[^a-zA-Z-]?N   r"   )	bytearrayr#   findallextendisalpha)r   filteredwordsword	last_chars        r   filter_international_wordsz(CharSetProber.filter_international_wordsB   s     ;;
 
O     
	' 
	'DOOD"I&&& RSS	I$$&& !9w+> ! 	OOI&&&&r   c                    t                      }d}d}t          t          |                     D ]y}| ||dz            }|dk    rd}n|dk    rd}|dk     rS|                                s?||k    r4|s2|                    | ||                    |                    d           |dz   }z|s|                    | |d	                    |S )
a  
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        Fr   r      >   <Tr(   r"   N)r)   rangelenr,   r+   )r   r-   in_tagprevcurrbuf_chars         r   filter_with_english_lettersz)CharSetProber.filter_with_english_lettersg   s    ;;#c((OO 	  	 D4q=)H4 T!  '!  (*:*:*<*<  $; *v * OOCT	N333OOD)))ax  	( OOCJ'''r   r   )r   
__module____qualname__SHORTCUT_THRESHOLDr   r   propertyr   r   r   r    staticmethodr%   r1   r;   r   r   r   r   r   #   s        2 2 2 2
- - -   X     X     \ " " \"H ) ) \) ) )r   r   )r   r#   enumsr   objectr   r   r   r   <module>rC      si   :  				      n n n n nF n n n n nr   