
    3 d1                      ,   d Z ddlZddlmZ ddlmZmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddl m!Z!m"Z" dZ# ej$        d          Z%d Z&d Z'd Z( G d d          Z) G d d          Z*dS )z#
Link extractor based on lxml.html
    N)partial)urljoinurlparse)etree)HTMLTranslator)strip_html5_whitespace)canonicalize_urlsafe_url_string)Link)IGNORED_EXTENSIONS_is_valid_url_matches_re_typere)arg_to_iterrel_has_nofollow)unique)get_base_url)url_has_any_extensionurl_is_from_any_domainzhttp://www.w3.org/1999/xhtmlzstring()c                     t          | t                    rO| d         dk    rC| dt          t                    dz            t          k    r|                     d          d         S | S )Nr   {   })
isinstancestrlenXHTML_NAMESPACEsplit)tags    >lib/python3.11/site-packages/scrapy/linkextractors/lxmlhtml.py_nonsr#       sa    #s &q6S= 	&SS%9%9A%=!=>/Q 	&99S>>"%%J    c                     | S N )xs    r"   	_identityr)   '   s    Hr$   c                 .    t          | j        d          S )NT)keep_fragments)r	   url)links    r"   _canonicalize_link_urlr.   +   s    DHT::::r$   c                   @    e Zd Z	 	 	 	 	 	 ddZd Zd Zd	 Zd
 Zd ZdS )LxmlParserLinkExtractorahrefNFTc                 `   t          |          r|nt          t          j        |          | _        t          |          r|nt          t          j        |          | _        t          |          r|nt          | _        || _        || _	        |rt          j
        d          nt          | _        d S )Nr,   )callabler   operatoreqscan_tag	scan_attrr)   process_attrr   strip
attrgetterr.   link_key)selfr!   attrprocessr   r:   canonicalizeds          r"   __init__z LxmlParserLinkExtractor.__init__0   s      (}}K'(+s2K2K!)$OWX[$5O5O'/'8'8GGGi
*7SH&&&=S 	r$   c              #      K   |                     t          j                  D ]Y}|                     t	          |j                            s*|j        }|D ]%}|                     |          s||||         fV  &Zd S r&   )iterr   Elementr7   r#   r!   attribr8   )r=   documentelattribsrE   s        r"   _iter_linksz#LxmlParserLinkExtractor._iter_linksB   s      --.. 	4 	4B==rv// iG! 4 4~~f-- 676?333334		4 	4r$   c           
          g }|                      |j                  D ]\  }}}	 | j        rt          |          }t	          ||          }|                     |          }	|	En# t          $ r Y Rw xY wt          |	|          }	t	          ||	          }	t          |	t          |          pdt          |                    d                              }
|                    |
           |                     |          S )N)encoding rel)nofollow)rI   rootr:   r   r   r9   
ValueErrorr
   r   _collect_string_contentr   getappend_deduplicate_if_needed)r=   selectorresponse_urlresponse_encodingbase_urllinksrG   r>   attr_valr,   r-   s              r"   _extract_linksz&LxmlParserLinkExtractor._extract_linksL   s+   "&"2"28="A"A 	 	Bh	: @5h??H"8X66 ''11 	     "#0ABBBC,,,C'++1r)"&&--88  D
 LL**5111s   &A##
A0/A0c                 n    t          |          }|                     |j        |j        |j        |          S r&   )r   r[   rU   r,   rK   )r=   responserX   s      r"   extract_linksz%LxmlParserLinkExtractor.extract_linksf   s8    ))""x|X->
 
 	
r$   c                 ,    |                      |          S )zcNormalize and filter extracted links

        The subclass should override it if necessary
        )rT   r=   rY   s     r"   _process_linksz&LxmlParserLinkExtractor._process_linksl   s    
 **5111r$   c                 @    | j         rt          || j                  S |S )N)key)r   unique_listr<   r`   s     r"   rT   z.LxmlParserLinkExtractor._deduplicate_if_neededs   s&    ; 	9u$-8888r$   )r1   r2   NFTF)	__name__
__module____qualname__rA   rI   r[   r^   ra   rT   r'   r$   r"   r0   r0   /   s         
 
 
 
$4 4 42 2 24
 
 
2 2 2    r$   r0   c                   d    e Zd Z e            Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZd Zd	 Zd
 Zd Z	d Z
dS )LxmlLinkExtractorr'   r1   arear2   FTNc           
      d   t          t          |                    t          t          |                    }}t          t          t          j        |          t          t          j        |          |	|
||          | _        d t          |          D             | _        d t          |          D             | _        t          t          |                    | _	        t          t          |                    | _
        t          t          |                    | _        | xj        t          t          | j        j        t          |                              z  c_        |t           }|| _        d t          |          D             | _        d t          |          D             | _        d S )N)r!   r>   r   r?   r:   r@   c                 d    g | ]-}t          |t                    r|nt          j        |          .S r'   r   r   r   compile.0r(   s     r"   
<listcomp>z.LxmlLinkExtractor.__init__.<locals>.<listcomp>   s@     
 
 
@AAx((;AAbjmm
 
 
r$   c                 d    g | ]-}t          |t                    r|nt          j        |          .S r'   ro   rq   s     r"   rs   z.LxmlLinkExtractor.__init__.<locals>.<listcomp>   s@     
 
 
@AAx((;AAbjmm
 
 
r$   c                     h | ]}d |z   S ).r'   )rr   es     r"   	<setcomp>z-LxmlLinkExtractor.__init__.<locals>.<setcomp>   s    NNNAaNNNr$   c                 d    g | ]-}t          |t                    r|nt          j        |          .S r'   ro   rq   s     r"   rs   z.LxmlLinkExtractor.__init__.<locals>.<listcomp>   sC     
 
 
 Ax((;AAbjmm
 
 
r$   )setr   r0   r   r5   containslink_extractor	allow_resdeny_resallow_domainsdeny_domainstuplerestrict_xpathsmap_csstranslatorcss_to_xpathr   canonicalizedeny_extensionsrestrict_text)r=   allowdenyr   r   r   tagsattrsr   r   process_valuer   restrict_cssr:   r   s                  r"   rA   zLxmlLinkExtractor.__init__|   s   " +d++,,c+e2D2D.E.Ee5)400*E22!&
 
 

 
EPQVEWEW
 
 

 
EPQUEVEV
 
 
 !]!;!;<<L 9 9::$[%A%ABB#0+l2K2KLL!
 !
 	
  	10O(NN_1M1MNNN
 
 //
 
 
r$   c                    t          |j                  sdS | j        rt          |j        | j                  sdS | j        rt          |j        | j                  rdS t          |j                  }| j        rt          || j                  sdS | j        rt          || j                  rdS | j	        rt          || j	                  rdS | j        rt          |j        | j                  sdS dS )NFT)r   r,   r}   r   r~   r   r   r   r   r   r   r   text)r=   r-   
parsed_urls      r"   _link_allowedzLxmlLinkExtractor._link_allowed   s   TX&& 	5> 	(48T^"D"D 	5= 	Xdh>> 	5dh''
 	&<*'
 '
 	 5 	!7
DDU!V!V 	5 	$9,%
 %
 	 5 	hty$:L&M&M 	5tr$   c                 ,   | j         rt          | j                   sdS | j        rt          | j                  rdS | j        rfd| j        D             ndg}| j        rfd| j        D             ng }t          |          ot          |           S )NFc              3   B   K   | ]}|                               V  d S r&   searchrr   regexr,   s     r"   	<genexpr>z,LxmlLinkExtractor.matches.<locals>.<genexpr>   s/      ;;5U\\#;;;;;;r$   Tc              3   B   K   | ]}|                               V  d S r&   r   r   s     r"   r   z,LxmlLinkExtractor.matches.<locals>.<genexpr>   s/      ??%,,s##??????r$   )r   r   r   r}   r~   any)r=   r,   alloweddenieds    `  r"   matcheszLxmlLinkExtractor.matches   s     	&<S$BT&U&U 	5 	!7T=N!O!O 	5 ~;;;;DN;;;; 	
 DH=X????????VX7||/CKK/r$   c                       fd|D             } j         r|D ]}t          |j                  |_         j                            |          }|S )Nc                 >    g | ]}                     |          |S r'   )r   )rr   r(   r=   s     r"   rs   z4LxmlLinkExtractor._process_links.<locals>.<listcomp>   s,    ;;;qT%7%7%:%:;;;;r$   )r   r	   r,   r|   ra   )r=   rY   r-   s   `  r"   ra   z LxmlLinkExtractor._process_links   sd    ;;;;E;;; 	6 6 6+DH55#22599r$   c                 &     | j         j        |i |S r&   )r|   r[   )r=   argskwargss      r"   r[   z LxmlLinkExtractor._extract_links   s    1t"14B6BBBr$   c                 D   t                    }| j        rfd| j        D             }nj        g}g }|D ]L}|                     |j        j        |          }|                    |                     |                     M| j        j	        rt          |          S |S )av  Returns a list of :class:`~scrapy.link.Link` objects from the
        specified :class:`response <scrapy.http.Response>`.

        Only links that match the settings passed to the ``__init__`` method of
        the link extractor are returned.

        Duplicate links are omitted if the ``unique`` attribute is set to ``True``,
        otherwise they are returned.
        c                 D    g | ]}                     |          D ]}|S r'   )xpath)rr   r(   subdocr]   s      r"   rs   z3LxmlLinkExtractor.extract_links.<locals>.<listcomp>   sG       8>>RSCTCT 9?   r$   )r   r   rU   r[   r,   rK   extendra   r|   r   rd   )r=   r]   rX   docs	all_linksdocrY   s    `     r"   r^   zLxmlLinkExtractor.extract_links   s      )) 	'    $ 4  DD %&D	 	9 	9C''X\8;LhWWET00778888% 	*y)))r$   )r'   r'   r'   r'   r'   rj   rl   FTNNr'   TN)re   rf   rg   r   r   rA   r   r   ra   r[   r^   r'   r$   r"   ri   ri   y   s        #^%%N 0
 0
 0
 0
d  ,0 0 0  C C C    r$   ri   )+__doc__r5   	functoolsr   urllib.parser   r   lxmlr   parsel.csstranslatorr   
w3lib.htmlr   	w3lib.urlr	   r
   scrapy.linkr   scrapy.linkextractorsr   r   r   r   r   scrapy.utils.miscr   r   scrapy.utils.pythonr   rd   scrapy.utils.responser   scrapy.utils.urlr   r   r   XPathrQ   r#   r)   r.   r0   ri   r'   r$   r"   <module>r      s           * * * * * * * *       / / / / / / - - - - - - 7 7 7 7 7 7 7 7                    < ; ; ; ; ; ; ; 5 5 5 5 5 5 . . . . . . J J J J J J J J 1%%+j11     ; ; ;G G G G G G G GT| | | | | | | | | |r$   