
    3 d                         d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
mZmZ  e j        e          Z G d de          Zd Zdd
ZdS )    N)RequestXmlResponse)Spider)gunzipgzip_magic_number)Sitemapsitemap_urls_from_robotsc                   J     e Zd ZdZdgZdgZdZ fdZd Zd Z	d Z
d	 Z xZS )
SitemapSpider ) parser   Fc                 *    t                      j        |i | g | _        | j        D ]S\  }}t	          |t
                    rt          | |          }| j                            t          |          |f           Td | j	        D             | _
        d S )Nc                 ,    g | ]}t          |          S r   )regex).0xs     6lib/python3.11/site-packages/scrapy/spiders/sitemap.py
<listcomp>z*SitemapSpider.__init__.<locals>.<listcomp>   s    >>>Qa>>>    )super__init___cbssitemap_rules
isinstancestrgetattrappendr   sitemap_follow_follow)selfakwrc	__class__s        r   r   zSitemapSpider.__init__   s    !"r"""	& 	, 	,DAq!S!! %D!$$IeAhh]++++>>$*=>>>r   c              #   L   K   | j         D ]}t          || j                  V  d S N)sitemap_urlsr   _parse_sitemap)r!   urls     r   start_requestszSitemapSpider.start_requests   s>      $ 	4 	4C#t2333333	4 	4r   c              #      K   |D ]}|V  dS )zThis method can be used to filter sitemap entries by their
        attributes, for example, you can filter locs with lastmod greater
        than a given date (see docs).
        Nr   )r!   entriesentrys      r   sitemap_filterzSitemapSpider.sitemap_filter    s,      
  	 	EKKKK	 	r   c              #     K   |j                             d          r8t          |j        |j                   D ]}t	          || j                  V  d S |                     |          }|#t                              dd|id| i           d S t          |          }| 
                    |          }|j        dk    rRt          || j                  D ]:t          fd	| j        D                       rt	          | j                  V  ;d S |j        d
k    rOt          || j                  D ];| j        D ]/\  }}|                              rt	          |          V   n0:d S d S )Nz/robots.txt)base_url)callbackz&Ignoring invalid sitemap: %(response)sresponsespider)extrasitemapindexc              3   B   K   | ]}|                               V  d S r(   )search)r   r   locs     r   	<genexpr>z/SitemapSpider._parse_sitemap.<locals>.<genexpr>;   s-      ??Q188C==??????r   urlset)r+   endswithr	   textr   r*   _get_sitemap_bodyloggerwarningr   r0   typeiterlocsitemap_alternate_linksanyr    r   r9   )	r!   r4   r+   bodysitr$   r%   r:   s	           @r   r*   zSitemapSpider._parse_sitemap(   s     <  // 	"/UUU A AcD,?@@@@@@@A A ))(33D <*#T*    
 A$$Q''Bv' 	""2t'CDD I IC????$,????? I%cD4GHHHHHHI I 8# ""2t'CDD " "C $	 " "188C== "")#":":"::::!E"" "" "r   c                     t          |t                    r|j        S t          |          rt	          |j                  S |j                            d          s|j                            d          r|j        S dS )zsReturn the sitemap body contained in the given response,
        or None if the response is not a sitemap.
        z.xmlz.xml.gzN)r   r   rF   r   r   r+   r=   )r!   r4   s     r   r?   zSitemapSpider._get_sitemap_bodyD   s     h,, 	!= X&& 	)(-((( <  (( 	!HL,A,A),L,L 	!= 	! 	!r   )__name__
__module____qualname__r)   r   r   rD   r   r,   r0   r*   r?   __classcell__)r&   s   @r   r   r      s        L"OMTN#? ? ? ? ?4 4 4  " " "8! ! ! ! ! ! !r   r   c                 X    t          | t                    rt          j        |           S | S r(   )r   r   recompile)r   s    r   r   r   Y   s'    !S z!}}Hr   Fc              #   P   K   | D ] }|d         V  |rd|v r|d         E d {V  !d S )Nr:   	alternater   )rH   altds      r   rC   rC   _   s_       & &h  	&;!# 	&~%%%%%%%& &r   )F)loggingrO   scrapy.httpr   r   scrapy.spidersr   scrapy.utils.gzr   r   scrapy.utils.sitemapr   r	   	getLoggerrJ   r@   r   r   rC   r   r   r   <module>r[      s     				 , , , , , , , , ! ! ! ! ! ! 5 5 5 5 5 5 5 5 B B B B B B B B		8	$	$J! J! J! J! J!F J! J! J!Z  & & & & & &r   