
    3 dDP                        d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4  ej5        e6          Z7 G d de8          Z9 G d d          Z: G d d          Z; G d d          Z< G d  d!          Z= G d" d#e$          Z>dS )$z@
Files Pipeline

See documentation in topics/media-pipeline.rst
    N)defaultdict)suppress)FTP)BytesIO)Path)DefaultDictOptionalSet)urlparse)ItemAdapter)deferthreads)IgnoreRequestNotConfigured)Request)NO_CALLBACK)MediaPipeline)Settings)is_botocore_available)CaselessDict)ftp_store_file)failure_to_exc_info)md5sum)to_bytes)referer_strc                       e Zd ZdZdS )FileExceptionzGeneral media error exceptionN)__name__
__module____qualname____doc__     6lib/python3.11/site-packages/scrapy/pipelines/files.pyr   r   '   s        ''''r#   r   c                   b    e Zd ZdefdZddefdZdefdZdedefdZdd	ed
e	e         fdZ
dS )FSFilesStorebasedirc                     d|v r|                     dd          d         }|| _        |                     t          | j                             t	          t
                    | _        d S )Nz://   )splitr'   _mkdirr   r   setcreated_directories)selfr'   s     r$   __init__zFSFilesStore.__init__,   s`    G 	1mmE1--a0GD&&'''?J3?O?O   r#   Npathc                     |                      |          }|                     |j        |           |                    |                                           d S N)_get_filesystem_pathr+   parentwrite_bytesgetvalue)r.   r0   bufinfometaheadersabsolute_paths          r$   persist_filezFSFilesStore.persist_file3   sN    11$77M($///!!#,,..11111r#   c                    |                      |          }	 |                                j        }n# t          j        $ r i cY S w xY w|                    d          5 }t          |          }d d d            n# 1 swxY w Y   ||dS )Nrblast_modifiedchecksum)r3   statst_mtimeoserroropenr   )r.   r0   r8   r;   r@   frA   s          r$   	stat_filezFSFilesStore.stat_file8   s    11$77	)..009MMx 	 	 	III	 %% 	!ayyH	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! "/HEEEs!   1 AAA99A= A=returnc                 P    |                     d          }t          | j        g|R  S )N/)r*   r   r'   )r.   r0   
path_compss      r$   r3   z!FSFilesStore._get_filesystem_pathD   s)    ZZ__
DL.:....r#   dirnamedomainc                     |r| j         |         nt                      }t          |          |vrN|                                s|                    d           |                    t          |                     d S d S )NT)parents)r-   r,   strexistsmkdiradd)r.   rM   rN   seens       r$   r+   zFSFilesStore._mkdirH   s~    39Dt'//suuw<<t# 	#>>## ,d+++HHS\\"""""	# 	#r#   NNr2   )r   r   r    rQ   r/   r<   rH   r   r3   r	   r+   r"   r#   r$   r&   r&   +   s        P P P P P2 2 2 2 2 2

Fc 
F 
F 
F 
F/ / / / / /# #d #HSM # # # # # #r#   r&   c                   V    e Zd ZdZdZdZdZdZdZdZ	dZ
ddiZd Zd Zd Zd
dZd	 ZdS )S3FilesStoreNprivateCache-Controlmax-age=172800c           
         t                      st          d          dd l}|j                                        }|                    d| j        | j        | j        | j	        | j
        | j        | j                  | _        |                    d          st          d| d          |dd                              d	d
          \  | _        | _        d S )Nzmissing botocore libraryr   s3)aws_access_key_idaws_secret_access_keyaws_session_tokenendpoint_urlregion_nameuse_sslverifyzs3://Incorrect URI scheme in z, expected 's3'   rK   r)   )r   r   botocore.sessionsessionget_sessioncreate_clientAWS_ACCESS_KEY_IDAWS_SECRET_ACCESS_KEYAWS_SESSION_TOKENAWS_ENDPOINT_URLAWS_REGION_NAMEAWS_USE_SSL
AWS_VERIFY	s3_client
startswith
ValueErrorr*   bucketprefix)r.   uribotocorerh   s       r$   r/   zS3FilesStore.__init__^   s    $&& 	< :;;;"..00 .."4"&"<"4.,$? / 	
 	
 ~~g&& 	NLLLLMMM#&qrr7==a#8#8 T[[[r#   c                 X    d }|                      |                              |          S )Nc                     | d                              d          }| d         }t          j        |                                          }||dS )NETag"LastModifiedrA   r@   )striptimemktime	timetuple)boto_keyrA   r@   modified_stamps       r$   
_onsuccessz*S3FilesStore.stat_file.<locals>._onsuccesss   sM    '--c22H$^4M![)@)@)B)BCCN (>JJJr#   )_get_boto_keyaddCallback)r.   r0   r8   r   s       r$   rH   zS3FilesStore.stat_filer   s8    	K 	K 	K !!$''33J???r#   c                 d    | j          | }t          j        | j        j        | j        |          S )N)BucketKey)rv   r   deferToThreadrr   head_objectru   )r.   r0   key_names      r$   r   zS3FilesStore._get_boto_key{   s<    k)4))$N&t{
 
 
 	
r#   c           	      ^   | j          | }|                    d           |                     | j                  }|r(|                    |                     |                     t          j        | j        j        f| j	        ||d |pi 
                                D             | j        d|S )zUpload file to S3 storager   c                 4    i | ]\  }}|t          |          S r"   rQ   .0kvs      r$   
<dictcomp>z-S3FilesStore.persist_file.<locals>.<dictcomp>   s$    AAADAqaQAAAr#   )r   r   BodyMetadataACL)rv   seek_headers_to_botocore_kwargsHEADERSupdater   r   rr   
put_objectru   itemsPOLICY)r.   r0   r7   r8   r9   r:   r   extras           r$   r<   zS3FilesStore.persist_file   s    k)4))00>> 	DLL99'BBCCC$N%
;AATZR,>,>,@,@AAA
 
 
 
 	
r#   c                 6   t          i ddddddddd	d
ddddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*          }i }|                                D ]4\  }}	 ||         }|||<   # t          $ r t          d+| d,          w xY w|S )-z.Convert headers to botocore keyword arguments.Content-TypeContentTyperZ   CacheControlzContent-DispositionContentDispositionzContent-EncodingContentEncodingzContent-LanguageContentLanguagezContent-LengthContentLengthzContent-MD5
ContentMD5ExpireszX-Amz-Grant-Full-ControlGrantFullControlzX-Amz-Grant-Read	GrantReadzX-Amz-Grant-Read-ACPGrantReadACPzX-Amz-Grant-Write-ACPGrantWriteACPzX-Amz-Object-Lock-Legal-HoldObjectLockLegalHoldStatuszX-Amz-Object-Lock-ModeObjectLockModez#X-Amz-Object-Lock-Retain-Until-DateObjectLockRetainUntilDatezX-Amz-Request-PayerRequestPayerzX-Amz-Server-Side-EncryptionServerSideEncryptionSSEKMSKeyIdSSEKMSEncryptionContextSSECustomerAlgorithmSSECustomerKeySSECustomerKeyMD5StorageClassTaggingWebsiteRedirectLocation)z+X-Amz-Server-Side-Encryption-Aws-Kms-Key-Idz$X-Amz-Server-Side-Encryption-Contextz/X-Amz-Server-Side-Encryption-Customer-Algorithmz)X-Amz-Server-Side-Encryption-Customer-Keyz-X-Amz-Server-Side-Encryption-Customer-Key-Md5zX-Amz-Storage-ClasszX-Amz-TaggingzX-Amz-Website-Redirect-LocationzHeader "z" is not supported by botocore)r   r   KeyError	TypeError)r.   r:   mappingr   keyvaluekwargs          r$   r   z(S3FilesStore._headers_to_botocore_kwargs   s     &'; #$5	
 #$5 !/ | 9 +,> #K ' ( /0K )*: 67R  &~!" /0F#$ @M8QCY=MAT'5!*3L3  
 
: !--// 	% 	%JC%  %e  P P P N3 N N NOOOP s   *A88BrV   )r   r   r    rk   rl   rm   rn   ro   rp   rq   r   r   r/   rH   r   r<   r   r"   r#   r$   rX   rX   P   s         OKJF)G9 9 9(@ @ @
 
 

 
 
 
"( ( ( ( (r#   rX   c                   :    e Zd ZdZdZdZd Zd Zd Zd Z	ddZ
dS )	GCSFilesStoreNr[   c                    ddl m} |                    | j                  }|dd                              dd          \  }}|                    |          | _        || _        | j                            ddg          }d|vrt          	                    d	d
|i           d|vrt          
                    dd
|i           d S d S )Nr   )storage)projectrf   rK   r)   zstorage.objects.getzstorage.objects.createzNo 'storage.objects.get' permission for GSC bucket %(bucket)s. Checking if files are up to date will be impossible. Files will be downloaded every time.ru   zbNo 'storage.objects.create' permission for GSC bucket %(bucket)s. Saving files will be impossible!)google.cloudr   ClientGCS_PROJECT_IDr*   ru   rv   test_iam_permissionsloggerwarningrE   )r.   rw   r   clientru   rv   permissionss          r$   r/   zGCSFilesStore.__init__   s    (((((((;<<QRRsA..mmF++k66"$<=
 
 !3 	NNl6"  
 $;6 	LLt6"    	 	r#   c                     d }|                      |          }t          j        | j        j        |                              |          S )Nc                 x    | r7| j         }t          j        | j                                                  }||dS i S )Nr~   )md5_hashr   r   updatedr   )blobrA   r@   s      r$   r   z+GCSFilesStore.stat_file.<locals>._onsuccess   sB     N= $DL,B,B,D,D E E$,}MMMIr#   )_get_blob_pathr   r   ru   get_blobr   )r.   r0   r8   r   	blob_paths        r$   rH   zGCSFilesStore.stat_file   sS    	 	 	 ''--	$T[%99EEQQ
 
 	
r#   c                 "    |rd|v r|d         S dS )Nr   zapplication/octet-streamr"   )r.   r:   s     r$   _get_content_typezGCSFilesStore._get_content_type   s'     	+~0 	+>**))r#   c                     | j         |z   S r2   )rv   )r.   r0   s     r$   r   zGCSFilesStore._get_blob_path   s    {T!!r#   c                 P   |                      |          }| j                            |          }| j        |_        d |pi                                 D             |_        t          j        |j	        |
                                |                     |          | j                  S )Nc                 4    i | ]\  }}|t          |          S r"   r   r   s      r$   r   z.GCSFilesStore.persist_file.<locals>.<dictcomp>   s$    DDDtq!CFFDDDr#   )datacontent_typepredefined_acl)r   ru   r   CACHE_CONTROLcache_controlr   metadatar   r   upload_from_stringr6   r   r   )r.   r0   r7   r8   r9   r:   r   r   s           r$   r<   zGCSFilesStore.persist_file   s    ''--	{	**!/DD
/A/A/C/CDDD$#//88;	
 
 
 	
r#   rV   )r   r   r    r   r   r   r/   rH   r   r   r<   r"   r#   r$   r   r      su        N$M F  ,
 
 
* * *
" " "

 

 

 

 

 

r#   r   c                   .    e Zd ZdZdZdZd ZddZd ZdS )FTPFilesStoreNc                 d   |                     d          st          d| d          t          |          }|j        | _        |j        | _        t          |j        pd          | _        |j        p| j        | _        |j	        p| j
        | _	        |j                            d          | _        d S )Nzftp://re   z, expected 'ftp'   rK   )rs   rt   r   porthostnamehostintusernameFTP_USERNAMEpasswordFTP_PASSWORDr0   rstripr'   )r.   rw   us      r$   r/   zFTPFilesStore.__init__  s    ~~h'' 	OMMMMNNNSMMF	J	"%%	
7d&7
7d&7v}}S))r#   c           
          | j          d| }t          j        t          ||| j        | j        | j        | j        | j                  S )NrK   )r0   filer   r   r   r   use_active_mode)	r'   r   r   r   r   r   r   r   USE_ACTIVE_MODE)r.   r0   r7   r8   r9   r:   s         r$   r<   zFTPFilesStore.persist_file  sT    ,''''$]] 0	
 	
 	
 		
r#   c                 8      fd}t          j        ||          S )Nc                 $   	 t                      }|                    j        j                   |                    j        j                   j        r|                    d           j	         d|  }t          |                    d|           dd                                                    }t          j                    }|                    d| |j                   ||                                dS # t$          $ r i cY S w xY w)NFrK   zMDTM    zRETR r?   )r   connectr   r   loginr   r   r   set_pasvr'   floatvoidcmdr   hashlibmd5
retrbinaryr   	hexdigest	Exception)r0   ftp	file_pathr@   mr.   s        r$   
_stat_filez+FTPFilesStore.stat_file.<locals>._stat_file  s   eeDIty111		$-777' (LL'''#|44d44	 %ckk2E)2E2E&F&Fqrr&J&P&P&R&R S SKMM2y22AH===)6AKKMMRRR   			s   C<D   DD)r   r   )r.   r0   r8   r  s   `   r$   rH   zFTPFilesStore.stat_file  s1    	 	 	 	 	  $Z666r#   rV   )	r   r   r    r   r   r   r/   r<   rH   r"   r#   r$   r   r      sU        LLO	* 	* 	*
 
 
 
7 7 7 7 7r#   r   c                        e Zd ZdZdZdZeeeee	dZ
dZdZd fd	Zed	             Zd
efdZdddZd ZdddZd Zd ZdddZd ZddddZ xZS )FilesPipelinea  Abstract pipeline that implement the file downloading

    This pipeline tries to minimize network transfers and file processing,
    doing stat of the files and determining if file is new, up-to-date or
    expired.

    ``new`` files are those that pipeline never processed and needs to be
        downloaded from supplier site the first time.

    ``uptodate`` files are the ones that the pipeline processed and are still
        valid files.

    ``expired`` files are those that pipeline already processed but the last
        modification was made long time ago, so a reprocessing is recommended to
        refresh it in case of change.

    r   Z   ) r   r]   gsr  	file_urlsfilesNc                    |st           t          |t                    s|t          |          }d}|                     |          | _        t          j        | j        ||          }|	                     |d          | j
                  | _        t          | d          s| j        | _        t          | d          s| j        | _        |                     |d          | j                  | _        |                     |d          | j                  | _        t)                                          ||           d S )Nr  )base_class_namesettingsFILES_EXPIRESFILES_URLS_FIELDFILES_RESULT_FIELD)download_funcr  )r   
isinstancedictr   
_get_storestore	functoolspartial_key_for_pipegetintEXPIRESexpireshasattrDEFAULT_FILES_URLS_FIELDr  DEFAULT_FILES_RESULT_FIELDr  getfiles_urls_fieldfiles_result_fieldsuperr/   )r.   	store_urir  r  cls_nameresolve	__class__s         r$   r/   zFilesPipeline.__init__P  sJ    	 h%% 	* 	*))H"__Y//
#8
 
 
  ww'?'?NNt/00 	B$($AD!t122 	F&*&ED# (G&'')>!
 !
 #+,,G())4+B#
 #
 	}xHHHHHr#   c                    | j         d         }|d         |_        |d         |_        |d         |_        |d         |_        |d         |_        |d         |_        |d         |_        |d	         |_        | j         d
         }|d         |_	        |d         pd |_        | j         d         }|d         |_
        |d         |_        |                    d          |_        |d         } | ||          S )Nr]   rk   rl   rm   rn   ro   rp   rq   FILES_STORE_S3_ACLr
  r   FILES_STORE_GCS_ACLr  FTP_USERr   FEED_STORAGE_FTP_ACTIVEFILES_STORE)r  )STORE_SCHEMESrk   rl   rm   rn   ro   rp   rq   r   r   r   r   getboolr   )clsr  s3store	gcs_store	ftp_storer%  s         r$   from_settingszFilesPipeline.from_settingsj  s   #D)$,-@$A!(01H(I%$,-@$A!#+,>#? "*+<"=&}5%l3!"67%d+	#+,<#=	 #$9:Bd	%e,	!)*!5	!).!9	$,$4$45N$O$O	!]+	s9x0000r#   rw   c                     t          |                                          rd}nt          |          j        }| j        |         } ||          S )Nr   )r   is_absoluter   schemer/  )r.   rw   r8  	store_clss       r$   r  zFilesPipeline._get_store  sL    99  "" 	*FFc]])F&v.	y~~r#   itemc                      fd}                      |          t          j         j        j                  }|                    |d            |                     fd           |S )Nc                    | sd S |                      dd           }|sd S t          j                    |z
  }|dz  dz  dz  }|	j        k    rd S t                    }t                              d	j        |ddj        i           	                    j        d           |                      d	d           }j	        |dd
S )Nr@   <      zTFile (uptodate): Downloaded %(medianame)s from %(request)s referred in <%(referer)s>)	medianamerequestrefererspiderr   uptodaterA   urlr0   rA   status)
r!  r   r  r   r   debug
MEDIA_NAMErC  	inc_statsrG  )
resultr@   age_secondsage_daysrB  rA   r8   r0   rA  r.   s
         r$   r   z3FilesPipeline.media_to_download.<locals>._onsuccess  s     "JJ==M  )++5K"R'",r1H$,& !'**GLL,"o'gVV-	     NN4;
333zz*d33H{$$	  r#   )r8   r;  c                     d S r2   r"   )_s    r$   <lambda>z1FilesPipeline.media_to_download.<locals>.<lambda>  s    t r#   c                     t                               j        j        dz   t	          |           dj        i          S )Nz.store.stat_filerC  exc_infor   )r   rE   r(  r   r   rC  )rG   r8   r.   s    r$   rQ  z1FilesPipeline.media_to_download.<locals>.<lambda>  s=    fll'*<<,Q//- #   r#   )r  r   maybeDeferredr  rH   addCallbacks
addErrback)r.   rA  r8   r;  r   dfdr0   s   ```   @r$   media_to_downloadzFilesPipeline.media_to_download  s    	 	 	 	 	 	 	 	< ~~gDt~<<!$*"6dCC^^444    	
 	
 	
 
r#   c                     t          |j        t                    sBt          |          }t                              d| j        |||j        dd|j        i           t          )NzoFile (unknown-error): Error downloading %(medianame)s from %(request)s referred in <%(referer)s>: %(exception)s)r@  rA  rB  	exceptionrC  rD  )	r  r   r   r   r   r   rJ  rC  r   )r.   failurerA  r8   rB  s        r$   media_failedzFilesPipeline.media_failed  st    '-77 	!'**GNNG "&&&!(	   -  
 
 
 r#   c          	      |   t          |          }|j        dk    r<t                              d|j        ||dd|j        i           t          d          |j        s6t                              d||dd|j        i           t          d	          d
|j        v rd
nd}t                              d|||dd|j        i           | 	                    |j        |           	 | 
                    ||||          }|                     ||||          }n# t
          $ r<}	t                              d||t          |	          dd|j        id            d }	~	wt          $ rI}	t                              d||ddd|j        i           t          t          |	                    d }	~	ww xY w|j        |||dS )N   zZFile (code: %(status)s): Error downloading file from %(request)s referred in <%(referer)s>)rH  rA  rB  rC  rD  zdownload-errorzWFile (empty-content): Empty file from %(request)s referred in <%(referer)s>: no-content)rA  rB  zempty-contentcached
downloadedzMFile (%(status)s): Downloaded file from %(request)s referred in <%(referer)s>responser8   r;  r:  z\File (error): Error processing file from %(request)s referred in <%(referer)s>: %(errormsg)s)rA  rB  errormsgT)r   rT  zVFile (unknown-error): Error processing file from %(request)s referred in <%(referer)s>rS  rF  )r   rH  r   r   rC  r   bodyflagsrI  rK  r  file_downloadedrQ   r  rE   rG  )
r.   rc  rA  r8   r;  rB  rH  r0   rA   excs
             r$   media_downloadedzFilesPipeline.media_downloaded  s>   g&&?c! 	2NN8#?w7SS-	       0111} 	1NN/#88-	      000%7I\'gFFT[)	 	 	
 	
 	
 	t{F+++	*>>'H4d>SSD++Hgt$+OOHH 	 	 	NN:#SXXNN-      	* 	* 	*LL,#88-      C)))	* ; 	
 
 	
s%   %2D 
F/"7EF/&AF**F/c                     |j         j                            d|           |j         j                            d| |           d S )N
file_count)rC  zfile_status_count/)crawlerstats	inc_value)r.   rC  rH  s      r$   rK  zFilesPipeline.inc_stats  sM    &&|F&CCC&&'DF'D'DV&TTTTTr#   c                 j    t          |                              | j        g           }d |D             S )Nc                 :    g | ]}t          |t                     S ))callback)r   r   )r   r   s     r$   
<listcomp>z4FilesPipeline.get_media_requests.<locals>.<listcomp>  s%    ???QK000???r#   )r   r!  r"  )r.   r;  r8   urlss       r$   get_media_requestsz FilesPipeline.get_media_requests  s6    4  $$T%:B????$????r#   c                    |                      ||||          }t          |j                  }t          |          }|                    d           | j                            |||           |S )Nrb  r   )r  r   re  r   r   r  r<   )r.   rc  rA  r8   r;  r0   r7   rA   s           r$   rg  zFilesPipeline.file_downloaded
  sc    ~~gt$~OOhm$$#;;
c4000r#   c                     t          t                    5  d |D             t          |          | j        <   d d d            n# 1 swxY w Y   |S )Nc                     g | ]	\  }}||
S r"   r"   )r   okxs      r$   rr  z0FilesPipeline.item_completed.<locals>.<listcomp>  s"    9W9W9WATV9W!9W9W9Wr#   )r   r   r   r#  )r.   resultsr;  r8   s       r$   item_completedzFilesPipeline.item_completed  s    h 	X 	X9W9W9W9W9WKd56	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	Xs   "AA
Ac                <   t          j        t          |j                                                            }t          |j                  j        }|t          j        vr7d}t          j	        |j                  d         }|rt          j
        |          }d| | S )Nr	  r   zfull/)r   sha1r   rG  r   r   suffix	mimetypes	types_map
guess_typeguess_extension)r.   rA  rc  r8   r;  
media_guid	media_ext
media_types           r$   r  zFilesPipeline.file_path  s    \(7;"7"788BBDD
%%,	 I// 	BI"-gk::1=J B%5jAA	.z.9...r#   rV   )r   r   r    r!   rJ  r  r&   rX   r   r   r/  r  r   r/   classmethodr5  rQ   r  rY  r]  ri  rK  rt  rg  r{  r  __classcell__)r(  s   @r$   r  r  1  sz        $ JG M  +!(I I I I I I4 1 1 [1.c     8< ) ) ) ) )V  " AE 9
 9
 9
 9
 9
vU U U
@ @ @ @D       

/4 
/ 
/ 
/ 
/ 
/ 
/ 
/ 
/ 
/r#   r  )?r!   r  r   loggingr  rD   r   collectionsr   
contextlibr   ftplibr   ior   pathlibr   typingr   r	   r
   urllib.parser   itemadapterr   twisted.internetr   r   scrapy.exceptionsr   r   scrapy.httpr   scrapy.http.requestr   scrapy.pipelines.mediar   scrapy.settingsr   scrapy.utils.botor   scrapy.utils.datatypesr   scrapy.utils.ftpr   scrapy.utils.logr   scrapy.utils.miscr   scrapy.utils.pythonr   scrapy.utils.requestr   	getLoggerr   r   r  r   r&   rX   r   r   r  r"   r#   r$   <module>r     s   
           				  # # # # # #                         - - - - - - - - - - ! ! ! ! ! ! # # # # # # + + + + + + + + : : : : : : : :       + + + + + + 0 0 0 0 0 0 $ $ $ $ $ $ 3 3 3 3 3 3 / / / / / / + + + + + + 0 0 0 0 0 0 $ $ $ $ $ $ ( ( ( ( ( ( , , , , , ,		8	$	$( ( ( ( (I ( ( ("# "# "# "# "# "# "# "#Jj j j j j j j jZ?
 ?
 ?
 ?
 ?
 ?
 ?
 ?
D/7 /7 /7 /7 /7 /7 /7 /7dp/ p/ p/ p/ p/M p/ p/ p/ p/ p/r#   