
    3 d                         d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	  ej
        e          Z G d d          Z G d d	e          Z G d
 de          ZdS )zS
Offsite Spider Middleware

See documentation in docs/topics/spider-middleware.rst
    N)signals)Request)urlparse_cachedc                   T    e Zd Zd Zed             Zd Zd ZdefdZ	d Z
d Zd	 Zd
S )OffsiteMiddlewarec                     || _         d S N)stats)selfr
   s     @lib/python3.11/site-packages/scrapy/spidermiddlewares/offsite.py__init__zOffsiteMiddleware.__init__   s    


    c                 |     | |j                   }|j                            |j        t          j                   |S )N)signal)r
   r   connectspider_opened)clscrawleros      r   from_crawlerzOffsiteMiddleware.from_crawler   s8    C8MNNNr   c                 (      fd|pdD             S )Nc              3   H   K   | ]}                     |          |V  d S r	   _filter).0rr   spiders     r   	<genexpr>z:OffsiteMiddleware.process_spider_output.<locals>.<genexpr>   s6      CCa4<<6+B+BCCCCCCCr    r   )r   responseresultr   s   `  `r   process_spider_outputz'OffsiteMiddleware.process_spider_output   s%    CCCCC6<RCCCCr   c                Z   K   |pd2 3 d {V }|                      ||          r|W V  #6 d S )Nr   r   )r   r    r!   r   r   s        r   process_spider_output_asyncz-OffsiteMiddleware.process_spider_output_async   s]      | 	 	 	 	 	 	 	!||Av&&  $||s   *returnc                    t          |t                    sdS |j        s|                     ||          rdS t	          |          j        }|ra|| j        vrX| j                            |           t          	                    d||dd|i           | j
                            d|           | j
                            d|           d	S )
NTz3Filtered offsite request to %(domain)r: %(request)s)domainrequestr   )extrazoffsite/domains)r   zoffsite/filteredF)
isinstancer   dont_filtershould_followr   hostnamedomains_seenaddloggerdebugr
   	inc_value)r   r(   r   r'   s       r   r   zOffsiteMiddleware._filter#   s    '7++ 	4 	$"4"4Wf"E"E 	4 ))2 	CfD$55 	C!!&)))LLE!g66(    
 J  !26 BBB
/???ur   c                     | j         }t          |          j        pd}t          |                    |                    S )N )
host_regexr   r-   boolsearch)r   r(   r   regexhosts        r   r,   zOffsiteMiddleware.should_follow4   s8    w''06BELL&&'''r   c                 4   t          |dd          }|st          j        d          S t          j        d          }t          j        d          }g }|D ]}||                    |          r!d| d}t	          j        |t                     ;|                    |          r!d| d}t	          j        |t                     q|	                    t          j
        |                     d	d
                    |           d}t          j        |          S )z<Override this method to implement a different offsite policyallowed_domainsNr4   z^https?://.*$z:\d+$zCallowed_domains accepts only domains, not URLs. Ignoring URL entry z in allowed_domains.zCallowed_domains accepts only domains without ports. Ignoring entry z	^(.*\.)?(|z)$)getattrrecompilematchwarningswarn
URLWarningr7   PortWarningappendescapejoin)	r   r   r;   url_patternport_patterndomainsr'   messager8   s	            r   get_host_regexz OffsiteMiddleware.get_host_regex:   sM   !&*;TBB 	":b>>!j!122z(++% 	2 	2F   (( 2G*0G G G  gz2222$$V,, 2C&,C C C  g{3333ry0011112SXXg..222z%   r   c                 `    |                      |          | _        t                      | _        d S r	   )rL   r5   setr.   )r   r   s     r   r   zOffsiteMiddleware.spider_openedV   s)    --f55EEr   N)__name__
__module____qualname__r   classmethodr   r"   r$   r6   r   r,   rL   r   r   r   r   r   r      s             [
D D D  
$    "( ( (! ! !8" " " " "r   r   c                       e Zd ZdS )rC   NrO   rP   rQ   r   r   r   rC   rC   [           Dr   rC   c                       e Zd ZdS )rD   NrT   r   r   r   rD   rD   _   rU   r   rD   )__doc__loggingr>   rA   scrapyr   scrapy.httpr   scrapy.utils.httpobjr   	getLoggerrO   r0   r   WarningrC   rD   r   r   r   <module>r^      s   
  				              0 0 0 0 0 0		8	$	$G" G" G" G" G" G" G" G"T	 	 	 	 	 	 	 		 	 	 	 	' 	 	 	 	 	r   