o
    d%Bev.                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZ d dlmZ dd	lmZmZmZmZmZmZ eed
dedefddZeed
dedefddZeed
dedee fddZeed
dedefddZ eed
dedefddZ!eed
dedefddZ"eed
dedefddZ#eed
dedefddZ$eed
dedefddZ%eed
dedefdd Z&eed
dedefd!d"Z'eed
dedefd#d$Z(eed
dedefd%d&Z)eed
dedefd'd(Z*eed
dedefd)d*Z+eed
dedefd+d,Z,ee-ed
d-edefd.d/Z.eed
dedefd0d1Z/d_d3e0d4e1dee fd5d6Z2ed7d
d8edefd9d:Z3d3e0deee e0f fd;d<Z4d=edefd>d?Z5d`dAedBedefdCdDZ6dEedee fdFdGZ7dHedIede8fdJdKZ9dHedIedefdLdMZ:dNej;dOfd8edPe1dQeddfdRdSZ<	dadTe0dUedVe=dWe1dXedYedZe0d[ed\ee deeddf fd]d^Z>dS )b    N)IncrementalDecoder)aliases)	lru_cache)findall)	GeneratorListOptionalSetTupleUnion)MultibyteIncrementalDecoder   )ENCODING_MARKSIANA_SUPPORTED_SIMILARRE_POSSIBLE_ENCODING_INDICATIONUNICODE_RANGES_COMBINEDUNICODE_SECONDARY_RANGE_KEYWORDUTF8_MAXIMAL_ALLOCATION)maxsize	characterreturnc                 C   sd   zt | }W n
 ty   Y dS w d|v p1d|v p1d|v p1d|v p1d|v p1d|v p1d|v p1d	|v S )
NFz
WITH GRAVEz
WITH ACUTEzWITH CEDILLAzWITH DIAERESISzWITH CIRCUMFLEXz
WITH TILDEzWITH MACRONzWITH RING ABOVEunicodedataname
ValueErrorr   description r   8lib/python3.10/site-packages/charset_normalizer/utils.pyis_accentuated   s(   r   c                 C   s.   t | }|s	| S |d}tt|d dS )N r      )r   decompositionsplitchrint)r   Z
decomposedZcodesr   r   r   remove_accent(   s
   

r&   c                 C   s.   t | }t D ]\}}||v r|  S qdS )zK
    Retrieve the Unicode range official name from a single character.
    N)ordr   items)r   Zcharacter_ord
range_nameZ	ord_ranger   r   r   unicode_range3   s   r*   c                 C   *   z
t | }W d|v S  ty   Y dS w )NFZLATINr   r   r   r   r   is_latinA   s   r,   c                 C   s2   t | }d|v rdS t| }|d u rdS d|v S )NPTFZPunctuationr   categoryr*   r   character_categorycharacter_ranger   r   r   is_punctuationJ   s   
r3   c                 C   sB   t | }d|v sd|v rdS t| }|d u rdS d|v o |dkS )NSNTFZFormsZLor.   r0   r   r   r   	is_symbolY   s   
r6   c                 C   s$   t | }|d u r
dS d|v pd|v S )NFZ	EmoticonsZPictographs)r*   )r   r2   r   r   r   is_emoticonh   s   r7   c                 C   s.   |   s| dv r
dS t| }d|v p|dv S )N>   +<u   ｜>TZ>   ZPoZPdZPc)isspacer   r/   )r   r1   r   r   r   is_separatorr   s   
r=   c                 C   s   |   |  kS N)islowerisupperr   r   r   r   is_case_variable|   s   rB   c                 C   r+   )NFZCJKr   r   Zcharacter_namer   r   r   is_cjk      rD   c                 C   r+   )NFZHIRAGANAr   rC   r   r   r   is_hiragana   rE   rF   c                 C   r+   )NFZKATAKANAr   rC   r   r   r   is_katakana   rE   rG   c                 C   r+   )NFZHANGULr   rC   r   r   r   	is_hangul   rE   rH   c                 C   r+   )NFZTHAIr   rC   r   r   r   is_thai   rE   rI   c                 C   r+   )NFARABICr   rC   r   r   r   	is_arabic   rE   rK   c                 C   s4   zt | }W n
 ty   Y dS w d|v od|v S )NFrJ   zISOLATED FORMr   rC   r   r   r   is_arabic_isolated_form   s   rL   r)   c                    s   t  fddtD S )Nc                 3   s    | ]}| v V  qd S r>   r   ).0keywordr)   r   r   	<genexpr>   s    z-is_unicode_range_secondary.<locals>.<genexpr>)anyr   rO   r   rO   r   is_unicode_range_secondary   s   rR   c                 C   s(   |   du o|  du o| dko| dkS )NFu   ﻿)r<   isprintablerA   r   r   r   is_unprintable   s   
rU       sequencesearch_zonec                 C   s   t | tstt| }tt| dt|| jddd}t|dkr$dS |D ]'}| 	dd}t
 D ]\}}||krB|    S ||krL|    S q4q&dS )zW
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.
    Nasciiignoreerrorsr   -_)
isinstancebytes	TypeErrorlenr   r   mindecodelowerreplacer   r(   )rW   rX   Zseq_lenresultsZspecified_encodingencoding_aliasencoding_ianar   r   r   any_specified_encoding   s&   
rj      r   c                 C   s    | dv pt td| jtS )zQ
    Verify is a specific encoding is a multi byte one based on it IANA name
    >	   utf_8Z	utf_8_sigutf_32utf_7	utf_32_leutf_16	utf_32_be	utf_16_le	utf_16_beencodings.{})
issubclass	importlibimport_moduleformatr   r   )r   r   r   r   is_multi_byte_encoding   s   
ry   c                 C   sJ   t D ] }t | }t|tr|g}|D ]}| |r!||f    S qqdS )z9
    Identify and extract SIG/BOM in given sequence.
    )N    )r   r_   r`   
startswith)rW   iana_encodingZmarksZmarkr   r   r   identify_sig_or_bom  s   

r}   r|   c                 C   s   | dvS )N>   rp   rm   r   )r|   r   r   r   should_strip_sig_or_bom  s   r~   Tcp_namestrictc                 C   sL   |   dd} t D ]\}}| ||fv r|  S q|r$td| | S )Nr]   r^   z Unable to retrieve IANA for '{}')re   rf   r   r(   r   rx   )r   r   rh   ri   r   r   r   	iana_name"  s   r   decoded_sequencec                 C   s4   t  }| D ]}t|}|d u rq|| qt|S r>   )setr*   addlist)r   Zrangesr   r2   r   r   r   
range_scan2  s   r   iana_name_aiana_name_bc           	      C   s   t | st |r
dS td| j}td|j}|dd}|dd}d}tdD ]}t|g}||||krA|d7 }q,|d S )	Ng        rt   rZ   r[   r      r      )ry   rv   rw   rx   r   ranger`   rd   )	r   r   Z	decoder_aZ	decoder_bZid_aZid_bZcharacter_match_countiZto_be_decodedr   r   r   cp_similarity@  s*   


r   c                 C   s   | t v o	|t |  v S )z
    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    )r   )r   r   r   r   r   is_cp_similarX  s   
r   Zcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)slevelformat_stringc                 C   s:   t | }|| t  }|t | || d S r>   )loggingZ	getLoggerZsetLevelZStreamHandlerZsetFormatterZ	FormatterZ
addHandler)r   r   r   loggerZhandlerr   r   r   set_logging_handlerc  s
   

r   	sequencesri   offsets
chunk_sizebom_or_sig_availablestrip_sig_or_bomsig_payloadis_multi_byte_decoderdecoded_payloadc	                 c   s&   |r|du r|D ]}	||	|	|  }
|
s d S |
V  q	d S |D ]p}	|	| }|t | d kr/q | |	|	|  }|rA|du rA|| }|j||rHdndd}
|r|	dkrt|d}|r|
d | |vrt|	|	d d	D ]#}| || }|r{|du r{|| }|j|dd}
|
d | |v r nqi|
V  q d S )
NF   rZ   r   r[   r   r!      )rb   rd   rc   r   )r   ri   r   r   r   r   r   r   r   r   chunkZ	chunk_endZcut_sequenceZchunk_partial_size_chkjr   r   r   cut_sequence_chunksp  sD   

r   )rV   )Tr>   )?rv   r   r   codecsr   Zencodings.aliasesr   	functoolsr   rer   typingr   r   r   r	   r
   r   Z_multibytecodecr   Zconstantr   r   r   r   r   r   strboolr   r&   r*   r,   r3   r6   r7   r=   rB   rD   rF   rG   rH   rI   rK   rL   rb   rR   rU   r`   r%   rj   ry   r}   r~   r   r   floatr   r   INFOr   r   r   r   r   r   r   <module>   s      

									
 
	
