
    3 d                         d Z ddlZddlmZmZmZmZ ddlT ddlmZm	Z	 ddl
mZ d Zd Zd	 Zdd
Zd Zd Zd Zd Zd Zd Z	 	 	 	 ddZdS )z
This module contains general purpose URL functions not found in the standard
library.

Some of the functions that used to be imported from this module have been moved
to the w3lib.url module. Always import those from there instead.
    N)ParseResult	urldefragurlparse
urlunparse)*)_safe_chars_unquotepath)
to_unicodec                     t          |           j                                        sdS d |D             }t          fd|D                       S )z:Return True if the url belongs to any of the given domainsFc                 6    g | ]}|                                 S  )lower).0ds     0lib/python3.11/site-packages/scrapy/utils/url.py
<listcomp>z*url_is_from_any_domain.<locals>.<listcomp>   s     ***Qqwwyy***    c              3   T   K   | ]"}|k    p                     d |           V  #dS ).Nendswith)r   r   hosts     r   	<genexpr>z)url_is_from_any_domain.<locals>.<genexpr>   s>      HH1	6t}}WWW55HHHHHHr   )	parse_urlnetlocr   any)urldomainsr   s     @r   url_is_from_any_domainr      s`    S>> &&((D u**'***GHHHHHHHHHHr   c                 l    t          | |j        gt          t          |dg                     z             S )z2Return True if the url belongs to the given spiderallowed_domains)r   namelistgetattr)r   spiders     r   url_is_from_spiderr&      s7    !fk]T'&2CR"H"HIII  r   c                     t          |           j                                        t          fd|D                       S )z?Return True if the url ends with one of the extensions providedc              3   B   K   | ]}                     |          V  d S Nr   )r   extlowercase_paths     r   r   z(url_has_any_extension.<locals>.<genexpr>&   s1      BB~&&s++BBBBBBr   )r   pathr   r   )r   
extensionsr+   s     @r   url_has_any_extensionr.   #   s@    s^^(..00NBBBBzBBBBBBr   c                 j    t          | t                    r| S t          t          | |                    S )z\Return urlparsed url from the given argument (which could be an already
    parsed url)
    )
isinstancer   r   r
   )r   encodings     r   r   r   )   s3     #{## 
JsH--...r   c                     t          |           \  }}|                    d          s| S t          |d|dd                   S )a  
    Return the crawlable url according to:
    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    !_escaped_fragment_   N)r   
startswithadd_or_replace_parameter)r   defragfrags      r   escape_ajaxr:   2   sG    . S>>LFD??3 
#F,@$qrr(KKKr   c                     t          j        d| t           j                  }|st          |           }|j        rdnd}|| z   } | S )z=Add http as the default scheme if it is missing from the url.z^\w+://flagszhttp:zhttp://)rematchIr   r   )r   r?   partsschemes       r   add_http_if_no_schemerC   O   sK    HZBD111E !L7islJr   c                 ^    t          t          j        d| t          j                            S )Na  
            ^                   # start with...
            (
                \.              # ...a single dot,
                (
                    \. | [^/\.]+  # optionally followed by
                )?                # either a second dot or some characters
                |
                ~   # $HOME
            )?      # optional match of ".", ".." or ".blabla"
            /       # at least one "/" for a file path,
            .       # and something after the "/"
            r<   )boolr>   r?   VERBOSEstrings    r   _is_posix_pathrI   Z   s7    
 *	
 	
 	
  r   c                 x    t          t          j        d| t          j        t          j        z                      S )Nzg
            ^
            (
                [a-z]:\\
                | \\\\
            )
            r<   )rE   r>   r?   
IGNORECASErF   rG   s    r   _is_windows_pathrL   p   s>    
 -"*,
	
 
	
 
	
  r   c                 >    t          |           pt          |           S r)   )rI   rL   rG   s    r   _is_filesystem_pathrN      s    &!!=%5f%=%==r   c                 \    t          |           rt          |           S t          |           S )zWAdd an URL scheme if missing: file:// for filepath-like input or
    http:// otherwise.)rN   
any_to_urirC   )r   s    r   guess_schemerQ      s.     3 # %%%r   TFc                 r   t          |           }|j        }|s|r)|j        s|j        r|                    d          d         }|r5|j        r.|j        |j        fdv r|                    d|j         d          }t          |j        ||rdn|j	        |rdn|j
        |rdn|j        |rdn|j        f          S )a  Strip URL string from some of its components:

    - ``strip_credentials`` removes "user:password@"
    - ``strip_default_port`` removes ":80" (resp. ":443", ":21")
      from http:// (resp. https://, ftp://) URLs
    - ``origin_only`` replaces path component with "/", also dropping
      query and fragment components ; it also strips credentials
    - ``strip_fragment`` drops any #fragment component
    @))httpP   )httpsi  )ftp   : /)r   r   usernamepasswordsplitportrB   replacer   r,   paramsqueryfragment)r   strip_credentialsstrip_default_portorigin_onlystrip_fragment
parsed_urlr   s          r   	strip_urlrj      s    $ #JF '[ '')2' c""2& ?jo ?z/ 4
 
 	?
 ^^$9
$9$92>>F3CCJO4BB:#43BB:#3 9BBj&9	
	 	 	r   r)   )TTFT)__doc__r>   urllib.parser   r   r   r   	w3lib.urlr   r	   scrapy.utils.pythonr
   r   r&   r.   r   r:   rC   rI   rL   rN   rQ   rj   r   r   r   <module>ro      sF    
			 E E E E E E E E E E E E     / / / / / / / / * * * * * *I I I  C C C/ / / /L L L:    ,   > > >& & & ( ( ( ( ( (r   