
    3 d6                        d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ  ej        e          Z G d de          Z G d de          Z e	dd          Z G d de          ZdS )    N)abstractmethod)Path)OptionalTypeTypeVar)Deferred)Crawler)Request)Spider)job_dir)create_instanceload_objectc                       e Zd ZdZd Zd ZdS )BaseSchedulerMetazN
    Metaclass to check scheduler classes against the necessary interface
    c                 F    |                      t          |                    S N)__subclasscheck__type)clsinstances     5lib/python3.11/site-packages/scrapy/core/scheduler.py__instancecheck__z#BaseSchedulerMeta.__instancecheck__   s    $$T(^^444    c                     t          |d          o[t          |j                  oGt          |d          o7t          |j                  o#t          |d          ot          |j                  S )Nhas_pending_requestsenqueue_requestnext_request)hasattrcallabler   r   r   )r   subclasss     r   r   z#BaseSchedulerMeta.__subclasscheck__   sy    H455 06770"3440 1220 .11	0
 .//	
r   N)__name__
__module____qualname____doc__r   r    r   r   r   r      s<         5 5 5
 
 
 
 
r   r   c                       e Zd ZdZedefd            Zdedee	         fdZ
dedee	         fdZedefd	            Zed
edefd            Zedee         fd            ZdS )BaseSchedulera7  
    The scheduler component is responsible for storing requests received from
    the engine, and feeding them back upon request (also to the engine).

    The original sources of said requests are:

    * Spider: ``start_requests`` method, requests created for URLs in the ``start_urls`` attribute, request callbacks
    * Spider middleware: ``process_spider_output`` and ``process_spider_exception`` methods
    * Downloader middleware: ``process_request``, ``process_response`` and ``process_exception`` methods

    The order in which the scheduler returns its stored requests (via the ``next_request`` method)
    plays a great part in determining the order in which those requests are downloaded.

    The methods defined in this class constitute the minimal interface that the Scrapy engine will interact with.
    crawlerc                      |             S )zp
        Factory method which receives the current :class:`~scrapy.crawler.Crawler` object as argument.
        r%   )r   r(   s     r   from_crawlerzBaseScheduler.from_crawler6   s    
 suur   spiderreturnc                     dS )a  
        Called when the spider is opened by the engine. It receives the spider
        instance as argument and it's useful to execute initialization code.

        :param spider: the spider object for the current crawl
        :type spider: :class:`~scrapy.spiders.Spider`
        Nr%   selfr+   s     r   openzBaseScheduler.open=   	     	r   reasonc                     dS )a&  
        Called when the spider is closed by the engine. It receives the reason why the crawl
        finished as argument and it's useful to execute cleaning code.

        :param reason: a string which describes the reason why the spider was closed
        :type reason: :class:`str`
        Nr%   )r/   r2   s     r   closezBaseScheduler.closeG   r1   r   c                     t                      )zV
        ``True`` if the scheduler has enqueued requests, ``False`` otherwise
        NotImplementedErrorr/   s    r   r   z"BaseScheduler.has_pending_requestsQ   s    
 "###r   requestc                     t                      )a  
        Process a request received by the engine.

        Return ``True`` if the request is stored correctly, ``False`` otherwise.

        If ``False``, the engine will fire a ``request_dropped`` signal, and
        will not make further attempts to schedule the request at a later time.
        For reference, the default Scrapy scheduler returns ``False`` when the
        request is rejected by the dupefilter.
        r6   r/   r9   s     r   r   zBaseScheduler.enqueue_requestX   s     "###r   c                     t                      )a  
        Return the next :class:`~scrapy.http.Request` to be processed, or ``None``
        to indicate that there are no requests to be considered ready at the moment.

        Returning ``None`` implies that no request from the scheduler will be sent
        to the downloader in the current reactor cycle. The engine will continue
        calling ``next_request`` until ``has_pending_requests`` is ``False``.
        r6   r8   s    r   r   zBaseScheduler.next_requestf   s     "###r   N)r!   r"   r#   r$   classmethodr	   r*   r   r   r   r0   strr4   r   boolr   r
   r   r   r%   r   r   r'   r'   %   s          7    [6 hx&8    C HX$6     $d $ $ $ ^$ $w $4 $ $ $ ^$ 	$hw/ 	$ 	$ 	$ ^	$ 	$ 	$r   r'   )	metaclassSchedulerTV	Scheduler)boundc                      e Zd ZdZ	 	 	 	 	 	 	 ddee         dedee         fdZe	de
e         d	efd
            Zd	efdZded	ee         fdZded	ee         fdZded	efdZd	ee         fdZd	efdZded	efdZded	dfdZd	ee         fdZd Zd Zdee         d	ee         fdZded	efdZdeded	dfdZdS )rB   a
  
    Default Scrapy scheduler. This implementation also handles duplication
    filtering via the :setting:`dupefilter <DUPEFILTER_CLASS>`.

    This scheduler stores requests into several priority queues (defined by the
    :setting:`SCHEDULER_PRIORITY_QUEUE` setting). In turn, said priority queues
    are backed by either memory or disk based queues (respectively defined by the
    :setting:`SCHEDULER_MEMORY_QUEUE` and :setting:`SCHEDULER_DISK_QUEUE` settings).

    Request prioritization is almost entirely delegated to the priority queue. The only
    prioritization performed by this scheduler is using the disk-based queue if present
    (i.e. if the :setting:`JOBDIR` setting is defined) and falling back to the memory-based
    queue if a serialization error occurs. If the disk queue is not present, the memory one
    is used directly.

    :param dupefilter: An object responsible for checking and filtering duplicate requests.
                       The value for the :setting:`DUPEFILTER_CLASS` setting is used by default.
    :type dupefilter: :class:`scrapy.dupefilters.BaseDupeFilter` instance or similar:
                      any class that implements the `BaseDupeFilter` interface

    :param jobdir: The path of a directory to be used for persisting the crawl's state.
                   The value for the :setting:`JOBDIR` setting is used by default.
                   See :ref:`topics-jobs`.
    :type jobdir: :class:`str` or ``None``

    :param dqclass: A class to be used as persistent request queue.
                    The value for the :setting:`SCHEDULER_DISK_QUEUE` setting is used by default.
    :type dqclass: class

    :param mqclass: A class to be used as non-persistent request queue.
                    The value for the :setting:`SCHEDULER_MEMORY_QUEUE` setting is used by default.
    :type mqclass: class

    :param logunser: A boolean that indicates whether or not unserializable requests should be logged.
                     The value for the :setting:`SCHEDULER_DEBUG` setting is used by default.
    :type logunser: bool

    :param stats: A stats collector object to record stats about the request scheduling process.
                  The value for the :setting:`STATS_CLASS` setting is used by default.
    :type stats: :class:`scrapy.statscollectors.StatsCollector` instance or similar:
                 any class that implements the `StatsCollector` interface

    :param pqclass: A class to be used as priority queue for requests.
                    The value for the :setting:`SCHEDULER_PRIORITY_QUEUE` setting is used by default.
    :type pqclass: class

    :param crawler: The crawler object corresponding to the current crawl.
    :type crawler: :class:`scrapy.crawler.Crawler`
    NFjobdirlogunserr(   c	                     || _         |                     |          | _        || _        || _        || _        || _        || _        || _        d S r   )	df_dqdirdqdirpqclassdqclassmqclassrF   statsr(   )	r/   
dupefilterrE   rL   rM   rF   rN   rK   r(   s	            r   __init__zScheduler.__init__   sL     [[((
 
r   r   r,   c                 r   t          |j        d                   } | t          ||j        |          t          |j                  t          |j        d                   t          |j        d                   |j                            d          |j        t          |j        d                   |          S )zh
        Factory method, initializes the scheduler with arguments taken from the crawl settings
        DUPEFILTER_CLASSSCHEDULER_DISK_QUEUESCHEDULER_MEMORY_QUEUESCHEDULER_DEBUGSCHEDULER_PRIORITY_QUEUE)rO   rE   rL   rM   rF   rN   rK   r(   )r   settingsr   r   getboolrN   )r   r(   dupefilter_clss      r   r*   zScheduler.from_crawler   s    
 %W%56H%IJJs&~w7GQQ7+,, 01G HII 01I JKK%--.?@@- 01K LMM	
 	
 	
 		
r   c                 (    t          |           dk    S )Nr   )lenr8   s    r   r   zScheduler.has_pending_requests   s    4yy1}r   r+   c                     || _         |                                 | _        | j        r|                                 nd| _        | j                                        S )z
        (1) initialize the memory queue
        (2) initialize the disk queue if the ``jobdir`` attribute is a valid directory
        (3) return the result of the dupefilter's ``open`` method
        N)r+   _mqmqsrJ   _dqdqsrH   r0   r.   s     r   r0   zScheduler.open   sD     88::!%5488:::w||~~r   r2   c                     | j         P| j                                         }t          | j        t                    sJ |                     | j        |           | j                            |          S )z
        (1) dump pending requests to disk if there is a disk queue
        (2) return the result of the dupefilter's ``close`` method
        )r`   r4   
isinstancerJ   r>   _write_dqs_staterH   )r/   r2   states      r   r4   zScheduler.close   sc    
 8 	5HNN$$Edj#.....!!$*e444w}}V$$$r   r9   c                    |j         s<| j                            |          r"| j                            || j                   dS |                     |          }|r"| j                            d| j                   n6|                     |           | j                            d| j                   | j                            d| j                   dS )a  
        Unless the received request is filtered out by the Dupefilter, attempt to push
        it into the disk queue, falling back to pushing it into the memory queue.

        Increment the appropriate stats, such as: ``scheduler/enqueued``,
        ``scheduler/enqueued/disk``, ``scheduler/enqueued/memory``.

        Return ``True`` if the request was stored successfully, ``False`` otherwise.
        Fzscheduler/enqueued/diskr+   zscheduler/enqueued/memoryzscheduler/enqueuedT)	dont_filterrH   request_seenlogr+   _dqpushrN   	inc_value_mqpush)r/   r9   dqoks      r   r   zScheduler.enqueue_request   s     " 	tw';';G'D'D 	GKK---5||G$$ 	RJ  !:4; OOOOLL!!!J  !<T[ QQQ
1$+FFFtr   c                 4   | j                                         }|"| j                            d| j                   n7|                                 }|!| j                            d| j                   |!| j                            d| j                   |S )aj  
        Return a :class:`~scrapy.http.Request` object from the memory queue,
        falling back to the disk queue if the memory queue is empty.
        Return ``None`` if there are no more enqueued requests.

        Increment the appropriate stats, such as: ``scheduler/dequeued``,
        ``scheduler/dequeued/disk``, ``scheduler/dequeued/memory``.
        Nzscheduler/dequeued/memoryrf   zscheduler/dequeued/diskzscheduler/dequeued)r^   poprN   rk   r+   _dqpopr;   s     r   r   zScheduler.next_request   s     (,,.. 	TJ  !<T[ QQQQkkmmG T
$$%>t{$SSS 	KJ  !5dk JJJr   c                     | j         )t          | j                   t          | j                  z   nt          | j                  S )z>
        Return the total amount of enqueued requests
        )r`   r[   r^   r8   s    r   __len__zScheduler.__len__  s3     15Ws48}}s48}},,#dh--Wr   c                 0   | j         dS 	 | j                             |           dS # t          $ rd}| j        r1d}t                              |||ddd| j        i           d| _        | j                            d| j                   Y d }~dS d }~ww xY w)	NFTzUnable to serialize request: %(request)s - reason: %(reason)s - no more unserializable requests will be logged (stats being collected))r9   r2   r+   )exc_infoextrazscheduler/unserializablerf   )	r`   push
ValueErrorrF   loggerwarningr+   rN   rk   )r/   r9   emsgs       r   rj   zScheduler._dqpush  s    8 	5	HMM'"""$ 4#  	 	 	} &6 
  '155!#T[1	     !&J  !;DK PPP55555	s   ' 
BABBc                 :    | j                             |           d S r   )r^   rv   r;   s     r   rl   zScheduler._mqpush/  s    gr   c                 F    | j         | j                                         S d S r   )r`   ro   r8   s    r   rp   zScheduler._dqpop2  s!    8 	"8<<>>!tr   c                 H    t          | j        d| j        | j        d          S )z<Create a new priority queue instance, with in-memory storageN )rW   r(   downstream_queue_clskey)r   rK   r(   rM   r8   s    r   r]   zScheduler._mq7  s.    LL!%
 
 
 	
r   c                     |                      | j                  }t          | j        d| j        | j        | j        |          }|r3t                              ddt          |          id| j	        i           |S )z7Create a new priority queue instance, with disk storageN)rW   r(   r   r   
startpriosz1Resuming crawl (%(queuesize)d requests scheduled)	queuesizer+   )ru   )
_read_dqs_staterJ   r   rK   r(   rL   rx   infor[   r+   )r/   rd   qs      r   r_   zScheduler._dqA  s    $$TZ00LL!%

 
 
  	KKCc!ff%-    
 r   c                     |It          |d          }|                                s|                    d           t          |          S dS )z0Return a folder name to keep disk queue state atNzrequests.queueT)parents)r   existsmkdirr>   )r/   rE   rJ   s      r   rI   zScheduler._dqdirT  sO     	!122E<<>> *D)))u::tr   rJ   c                     t          |d          }|                                sg S |                    d          5 }t          j        |          cd d d            S # 1 swxY w Y   d S )Nactive.jsonutf-8encoding)r   r   r0   jsonload)r/   rJ   pathfs       r   r   zScheduler._read_dqs_state]  s    E=)){{}} 	IYYY(( 	 A9Q<<	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 s   AA"%A"rd   c                     t          |d                              dd          5 }t          j        ||           d d d            d S # 1 swxY w Y   d S )Nr   wr   r   )r   r0   r   dump)r/   rJ   rd   r   s       r   rc   zScheduler._write_dqs_stated  s    %'',,S7,CC 	 qIeQ	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 s   A		AA)NNNFNNN) r!   r"   r#   r$   r   r>   r?   r	   rP   r=   r   rA   r*   r   r   r   r0   r4   r
   r   r   intrr   rj   rl   rp   r]   r_   rI   listr   rc   r%   r   r   rB   rB   v   sk       0 0j !%%)    '"   ( 
${+ 
 
 
 
 [
 d    	6 	hx&8 	 	 	 		%C 	%HX$6 	% 	% 	% 	%w 4    ,hw/    (X X X X Xw 4    0w 4    )    

 
 
  &Xc] x}     S  T         c  $  4            r   )r   loggingabcr   pathlibr   typingr   r   r   twisted.internet.deferr   scrapy.crawlerr	   scrapy.http.requestr
   scrapy.spidersr   scrapy.utils.jobr   scrapy.utils.miscr   r   	getLoggerr!   rx   r   r   r'   rA   rB   r%   r   r   <module>r      s                 * * * * * * * * * * + + + + + + " " " " " " ' ' ' ' ' ' ! ! ! ! ! ! $ $ $ $ $ $ : : : : : : : :		8	$	$
 
 
 
 
 
 
 
&K$ K$ K$ K$ K$/ K$ K$ K$ K$\ gm;777p  p  p  p  p  p  p  p  p  p r   