
    DUf!                     R   d dl Z d dlmZ d dlmZ d dlmZ d dlZd dl	Z
ddlmZ ddlmZmZ ddlmZ g d	Zd
ddddddededededededee
j        e
j        f         fdZ	 d de
j        dede
j        fdZde
j        de
j        fdZd!dedede
j        fdZ G d d          ZdS )"    N)partial)Union)urljoin   )assembly_info)read_chromsizes
read_table)SCHEMAS)fetch_chromsizesfetch_centromeres
UCSCClientlocalFTz^chr[0-9]+$z	^chr[XY]$z^chrM$)provideras_bedfilter_chromschrom_patternsnatsortdbr   r   r   r   r   returnc                    |dk    r9t          |           }|r!|j        g d                                         S |j        S |dk    r t	          |           j        d||||d|S t          d| d          )	a  
    Fetch chromsizes from local storage or the UCSC database.

    Parameters
    ----------
    db : str
        Assembly name.
    provider : str, optional [default: "local"]
        The provider of chromsizes. Either "local" for local storage or "ucsc".
    as_bed : bool, optional
        If True, return chromsizes as an interval DataFrame (chrom, start, end)
        instead of a Series.

    The remaining options only apply to provider="ucsc".

    filter_chroms : bool, optional
        Filter for chromosome names given in ``chrom_patterns``.
    chrom_patterns : sequence, optional
        Sequence of regular expressions to capture desired sequence names.
    natsort : bool, optional
        Sort each captured group of names in natural order. Default is True.
    **kwargs :
        Passed to :func:`pandas.read_csv`

    Returns
    -------
    Series of integer bp lengths indexed by sequence name or BED3 DataFrame.

    Notes
    -----
    For more fine-grained control over the chromsizes from local storage,
    use :func:`bioframe.assembly_info`.

    Examples
    --------
    >>> fetch_chromsizes("hg38")
    name
    chr1     248956422
    chr2     242193529
    chr3     198295559
    ...      ...
    chrX     156040895
    chrY      57227415
    chrM         16569
    Name: length, dtype: int64

    >>> fetch_chromsizes("hg38", as_bed=True)
            chrom      start        end
    0        chr1          0  248956422
    1        chr2          0  242193529
    2        chr3          0  198295559
    ...      ...
    21       chrX          0  156040895
    22       chrY          0   57227415
    23       chrM          0      16569

    See also
    --------
    bioframe.assembly_info
    bioframe.UCSCClient
    r   )chromstartenducscr   r   r   r   Unknown provider ''N )r   	viewframecopy
chromsizesr   r   
ValueError)r   r   r   r   r   r   kwargsassemblys           R/var/www/html/software/conda/lib/python3.11/site-packages/bioframe/io/resources.pyr   r      s    N 7 $$ 	'%&?&?&?@EEGGG&&	V		.z"~~. 
')	
 

 
 
 	
 9h999:::    gieStaincybband_colc           	         | | |         dk             } |                      dd          }g }|D ]\  }}t          |          dk    s"t          d| dt          |                     |                    d          }|                    ||j        d	         d         |j        d
         d         |j        d	         d         d           t          j                            |          S )a  
    Extract chromosomal origin positions separating chromosome arms from
    cytological band data. Takes the cytological origin, i.e. the boundary
    between the two bands labeled 'acen'.

    Parameters
    ----------
    cyb : pandas.DataFrame
        DataFrame with cytoband data.

    Returns
    -------
    pandas.DataFrame
        A dataframe with columns 'chrom', 'start', 'end', 'mid'.
    acenr   F)sort   zExpected 2 'acen' bands for z, found r   r   r   r   r   r   r   mid)	groupbylenr#   sort_valuesappendilocpd	DataFramefrom_records)r)   r*   groupedcensr   groupacenss          r&   _origins_from_cytobandr=   m   s    $ c(mv%
&Ckk'k..GD 
 
u5zzQWEWW3u::WWXXX!!'**Aw/z!}U+z!}U+	 	
 	
 	
 	
 <$$T***r'   r:   c                 8   |                      d                              t          j        t          j        d                                          } | d         | d         z   dz  | d<   | g d                             d                              d	          } | S )
a  
    Extract chromosomal origin positions from UCSC centromeres.txt table
    describing centromere model sequences. Takes the midpoint of all
    modeled centromere sequences.

    Parameters
    ----------
    cens : pandas.DataFrame
        DataFrame with centromeres.txt data.

    Returns
    -------
    pandas.DataFrame
        A dataframe with columns 'chrom', 'start', 'end', 'mid'.
    r   )r   r   r   r   r.   r0   r/   T)drop)r1   aggnpminmaxreset_indexr3   )r:   s    r&   _origins_from_ucsccentromeresrE      s      <<  $$rvbf%E%EFFRRTTD=4;.14DK---.	W			$		 	
 Kr'   c                    |dk    r<t          |           }|j        }|t          d| d          t          |d          S |dk    rt	          |           }d|j        fdt          |j        d	
          fd|j        fg}|D ]+\  }}	  |            } n-# t          j	        j
        $ r Y (w xY wt          d| d          |dk    rt          |          S t          |          S t          d| d          )a  
    Extract centromere locations for a given assembly 'db' from a variety
    of file formats in UCSC (cytoband, centromeres) depending on
    availability, returning a DataFrame.

    Parameters
    ----------
    db : str
        Assembly name.
    provider : str, optional [default: "local"]
        The provider of centromere data. Either "local" for local storage
        or "ucsc".

    Returns
    -------
    DataFrame with centromere 'chrom', 'start', 'end', 'mid'.

    Notes
    -----
    When provider="local", centromeres are derived from cytoband tables
    in local storage.

    Whe provider="ucsc", the fallback priority goes as follows:
    - UCSC cytoBand
    - UCSC cytoBandIdeo
    - UCSC centromeres.txt

    Note that UCSC "gap" files no longer provide centromere information.

    Currently only works for human assemblies.

    See also
    --------
    bioframe.assembly_info
    bioframe.UCSCClient
    r   Nz3No source for centromere data found from provider 'z'.stain)r*   r   cytobandT)ideocentromeresr   r   )r   	cytobandsr#   r=   r   fetch_cytobandr   r   urlliberror	HTTPErrorrE   )	r   r   r%   r)   clientfetchersschemafetcherdfs	            r&   r   r      s`   J 7 $$ ;RhRRR   &cG<<<<	V		B./!6TBBBCF45
  ( 		 		OFGWYY<)    RhRRR   ]""0444)"--- 9h999:::s   	
BB,+B,c                       e Zd ZdZdefdZ	 	 	 	 ddeded	ed
edee	j
        e	j        f         f
dZde	j        fdZd Zddede	j        fdZde	j        fdZdS )r   z https://hgdownload.soe.ucsc.edu/r   c                 P    || _         t          | j        d| d          | _        d S )NzgoldenPath//)_dbr   BASE_URL_db_url)selfr   s     r&   __init__zUCSCClient.__init__   s,    t}.AB.A.A.ABBr'   Tr   Fr   r   r   r   r   c                 b    t          | j        d| j         d          }t          |f||||d|S )NzbigZips/z.chrom.sizesr   )r   rZ   rX   r   )r[   r   r   r   r   r$   urls          r&   r   zUCSCClient.fetch_chromsizes   sX     dl$Etx$E$E$EFF
')
 
 
 
 	
r'   c                 J    t          | j        d          }t          |fddi|S )Nzdatabase/centromeres.txt.gzrR   rJ   r   rZ   r	   r[   r$   r^   s      r&   r   zUCSCClient.fetch_centromeres  s/    dl$ABB#>>m>v>>>r'   c                 P    t          | j        d          }t          |fdg dd|S )Nzdatabase/gap.txt.gzgap)r   r   r   lengthtypebridge)rR   usecolsr`   ra   s      r&   
fetch_gapszUCSCClient.fetch_gaps  sI    dl$9::
III
 
 	
 
 	
r'   rI   c                 ~    |rt          | j        d          }nt          | j        d          }t          |d          S )Nzdatabase/cytoBandIdeo.txt.gzzdatabase/cytoBand.txt.gzrH   )rR   r`   )r[   rI   r$   r^   s       r&   rL   zUCSCClient.fetch_cytoband  sD     	D$,(FGGCC$,(BCCC#j1111r'   c                 `    t          | j        d          }t          |fdt          d         i|S )Nzdatabase/all_mrna.txt.gzrR   all_mrna)r   rZ   r	   r
   ra   s      r&   
fetch_mrnazUCSCClient.fetch_mrna"  sE    dl$>??
 
:&
 
 
 	
r'   N)Tr   TF)F)__name__
__module____qualname__rY   strr\   booltupler   r6   Seriesr7   r   r   rh   rL   rl   r   r'   r&   r   r      s       1HC3 C C C C # I
 

 
 	

 
 
ry",&	'
 
 
 
$?R\ ? ? ? ?
 
 
2 24 2bl 2 2 2 2
bl 
 
 
 
 
 
r'   r   )r(   )r   )rM   	functoolsr   typingr   urllib.parser   numpyrA   pandasr6   r%   r   fileopsr   r	   schemasr
   __all__rp   rq   rr   rs   r7   r   r=   rE   r   r   r   r'   r&   <module>r|      s                                    # # # # # # 0 0 0 0 0 0 0 0         EV; V; V;V; V; 	V;
 V; V; V; 29bl"#V; V; V; V;t (2!+ !+	!+!$!+\!+ !+ !+ !+H     4G; G;# G; G;2< G; G; G; G;T3
 3
 3
 3
 3
 3
 3
 3
 3
 3
r'   