
    DUfK                     f   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlZd dlZ	 d dlZn# e$ r dZY nw xY w	 d dlZn# e$ r dZY nw xY wddlmZ ddlmZ ddlmZmZ g d	ZddZ	 	 	 	 ddZddZ	 	 	 	 	 d dZddZddZ G d d          Z d!dZ!d"dZ"d"dZ#d#dZ$d$dZ%dS )%    N)OrderedDict)closing   )
argnatsort)parse_region   )
BAM_FIELDSSCHEMAS)

read_tableread_chromsizes
read_tabixread_pairixread_bam
load_fastaread_bigwig	to_bigwigread_bigbed	to_bigbedFc                    |                     dd           |                     dd           |                     dd           t          | t                    r+|                     d          r|                     dd	           |x	 |                     d
t          |                    nU# t
          t          f$ rA t          |t                    rt          d| d          |                     d
|           Y nw xY wt          j	        | fi |}|rM|
                                                    d          dk                                    rt          d          |S )a;  
    Read a tab-delimited file into a data frame.

    Equivalent to :func:`pandas.read_table` but supports an additional
    `schema` argument to populate column names for common genomic formats.

    Parameters
    ----------
    filepath_or : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL
    schema : str
        Schema to use for table column names.
    schema_is_strict : bool
        Whether to check if columns are filled with NAs.

    Returns
    -------
    df : pandas.DataFrame of intervals

    sep	headerN	index_colF.gzcompressiongzipnameszTSV schema not found: ''r   axiszvone or more columns are all NA, check agreement between number of fields in schema and number of columns in input file)
setdefault
isinstancestrendswithr
   KeyError	TypeError
ValueErrorpdread_csvnotnasumany)filepath_orschemaschema_is_strictkwargsdfs        P/var/www/html/software/conda/lib/python3.11/site-packages/bioframe/io/fileops.pyr   r   *   s   * eT"""
h%%%
k5)))+s## 1(<(<U(C(C 1-000	/gwv7777)$ 	/ 	/ 	/&#&& F !D6!D!D!DEEEgv.....	/ 
[	+	+F	+	+B HHJJNNN""a',,.. 	9  
 Is   !B( (AC:9C:Tz^chr[0-9]+$z	^chr[XY]$z^chrM$c                    t          | t                    r+|                     d          r|                    dd           t	          j        | fdddgddgdt          id	|}|rg }|D ]o}t          |          s||d         j                            |                   }	|r |	j        t          |	d                            }	|
                    |	           pt	          j        |d
          }|r<d|d<   |g d                             dddd
                              d          }n|d         j        |_        |d         }|S )a[  
    Read a ``<db>.chrom.sizes`` or ``<db>.chromInfo.txt`` file from the UCSC
    database, where ``db`` is a genome assembly name, as a `pandas.Series`.

    Parameters
    ----------
    filepath_or : str or file-like
        Path or url to text file, or buffer.
    filter_chroms : bool, optional
        Filter for chromosome names given in ``chrom_patterns``.
    chrom_patterns : sequence, optional
        Sequence of regular expressions to capture desired sequence names.
    natsort : bool, optional
        Sort each captured group of names in natural order. Default is True.
    as_bed : bool, optional
        If True, return chromsizes as an interval dataframe (chrom, start, end).
    **kwargs :
        Passed to :func:`pandas.read_csv`

    Returns
    -------
    Series of integer bp lengths indexed by sequence name or an interval dataframe.

    Notes
    -----
    Mention name patterns

    See also
    --------
    * UCSC assembly terminology: <http://genome.ucsc.edu/FAQ/FAQdownloads.html#download9>
    * NCBI assembly terminology: <https://www.ncbi.nlm.nih.gov/grc/help/definitions>

    r   r   r   r   r   r   namelength)r   usecolsr   dtyper   start)r5   r9   r6   chromend)r5   r6   columnsT)drop)r"   r#   r$   r!   r(   r)   lencontainsilocr   appendconcatrenamereset_indexvaluesindex)
r-   filter_chromschrom_patternsnatsortas_bedr0   
chromtablepartspatternparts
             r2   r   r   V   s   R +s## 1(<(<U(C(C 1-000Ax sm   J  	.% 	 	Gw<< j04==gFFGD ;yDL!9!9:LLYu1---
 	*
72223VW66YVGG[d[## 	
 &f-4
)
    c                 R   ddl }t          |                    |                     5 }t          |j                  pd}t          j        t          j        d	                    |
                    |||                              dd|          }ddd           n# 1 swxY w Y   |S )z3
    Read a tabix-indexed file into dataFrame.
    r   N
r   )r   r   r   )pysamr   	TabixFilelistr   r(   r)   ioStringIOjoinfetch)fpr:   r9   r;   rR   fr   r1   s           r2   r   r      s     LLL	$$	%	% 
QX&$[K		!''%"<"<==>>	
 
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 Is   A)BB #B c                    ddl }ddl}	|i }|	                    | d          }
|
                                }t	          |          r|                    d |          }d|v rQ|Od |d         D             }t	          |          r.|,t          d |D              \  }}t          j        ||          }d	|v r*(|d	         d         	                                d
d         t          ||          \  }}}|t          ||          \  }}}n|||}}}|
                    ||||||          }|fd|D             fd|D             }|n|}t          j                            |          }LD ]I}||v r%||                             ||                   ||<   +t          j        ||         d          ||<   J|S )z4
    Read a pairix-indexed file into DataFrame.
    r   Nrc                 8    |                      d          d         S )N:r   split)xs    r2   <lambda>zread_pairix.<locals>.<lambda>   s    Q rO   z
#chromsizec                 F    g | ]}|                                 d d         S )r   Nr_   ).0lines     r2   
<listcomp>zread_pairix.<locals>.<listcomp>   s)    NNN$TZZ\\!""%NNNrO   c              3   P   K   | ]!}|d          t          |d                   fV  "dS )r   r   N)int)rd   items     r2   	<genexpr>zread_pairix.<locals>.<genexpr>   s5      &Q&Q4QT!W'>&Q&Q&Q&Q&Q&QrO   )rF   dataz#columnsr   c                 :    g | ]}                     |          S  )rF   )rd   colr<   s     r2   rf   zread_pairix.<locals>.<listcomp>   s%    <<<SgmmC((<<<rO   c                 .    g | ]fd D             S )c              3   (   K   | ]}|         V  d S )Nrm   )rd   irecords     r2   rj   z)read_pairix.<locals>.<listcomp>.<genexpr>   s'      22!F1I222222rO   rm   )rd   rr   
argusecolss    @r2   rf   zread_pairix.<locals>.<listcomp>   s/    DDDv2222z222DDDrO   r<   ignore)cytoolzpypairixopen
get_headerr>   groupbyzipr(   Seriesr`   r   query2D	DataFramefrom_recordsastype
to_numeric)rY   region1region2
chromsizesr<   r7   dtypesr0   toolzrw   rZ   r   header_groupsitemsr   lengthschrom1start1end1chrom2start2end2itrecordsr1   rn   rs   s       `                     @r2   r   r      s!    OOO~b#A\\^^F
6{{ ?&?&?HH=((Z-?NN-2MNNNE5zz Bj0!$&Q&Q5&Q&Q&Q!RwYUAAA
&&7?#J/288::122>G'<<FFD+GZ@@%vt	
664	>	>B<<<<G<<<
DDDDDDD		"	"7G	"	<	<B 	; 	;Cf}}S'..553-3::3IrO   c                    ddl }t          j                            |           d         }|dk    rd}n$|dk    rd}n|dk    rd	}nt	          | d
          t          |                    | |                    5 }g }|                    |||          D ]}	d |	j        D             }
|	                    |	j
        |	j        |	j        |	j        |	j        |	j        dk    r|	j        nt           j        |	j        |	j        |	j        |	j        |	j        t/          j        t3          |
                    f           t5          j        |t8                    }ddd           n# 1 swxY w Y   |S )z2
    Read alignment records into a DataFrame.
    r   Nr   z.samr\   z.bamrbz.cramrcz is not a supported filetypec                 ~    g | ]:\  }}|t          |          t          j        k    r|                                n|f;S rm   )typearraytolist)rd   kvs      r2   rf   z"read_alignment.<locals>.<listcomp>   s?    XXXAQd1gg&<&<


!DXXXrO   rt   )rR   ospathsplitextr'   r   AlignmentFilerX   tagsrA   qnameflagreference_nameposmapqcigarstringnpnanrnextpnexttlenseqqualjsondumpsdictr(   r~   r	   )rY   r:   r9   r;   rR   extmoderZ   r   sr   r1   s               r2   read_alignmentr      s    LLL
'

2

q
!C
f}}		C===>>>	$$R..	/	/ 71s++ 	 	AXXQRQWXXXDNNGF$EF%&Vq[[AMMbfGGFEFJtDzz**     \':666+7 7 7 7 7 7 7 7 7 7 7 7 7 7 7, Is   :CEE!$E!c                 &    t          | |||          S )zU
    Deprecated: use `read_alignment` instead.
    Read bam file into dataframe,
    )r   )rY   r:   r9   r;   s       r2   r   r     s    
 "eUC000rO   c                       e Zd Zd Zd ZdS )PysamFastaRecordc                 `    || _         ||j        vrt          d| d| d          || _        d S )NzReference name 'z' not found in 'r   )ff
referencesr%   ref)selfr   r   s      r2   __init__zPysamFastaRecord.__init__  sC    bm##HcHH2HHHIIIrO   c                     t          |t                    r|j        |j        }}n|}|dz   }| j                            | j        ||          S )Nr   )r"   slicer9   stopr   rX   r   )r   keyr9   r   s       r2   __getitem__zPysamFastaRecord.__getitem__#  sK    c5!! 	)SX4EEE7Dw}}TXud333rO   N)__name__
__module____qualname__r   r   rm   rO   r2   r   r     s2          4 4 4 4 4rO   r   rR   c                    t          | t                     }t                      }|                                }|dk    r	 ddl}n# t
          $ r t          d          w xY w|r4| D ]0} |j        |fi |}|j        d         }t          ||          ||<   1n |j        | fi |}|j        D ]}t          ||          ||<   n|dk    r	 ddl	}	n# t
          $ r t          d          w xY w|rM| D ]I} |	j
        |fi |}t          t          |                                                    }||         ||<   Jn@ |	j
        | fi |}|                                D ]}||         ||<   nt          d          |S )a  
    Load lazy fasta sequences from an indexed fasta file (optionally compressed)
    or from a collection of uncompressed fasta files.

    Parameters
    ----------
    filepath_or : str or iterable
        If a string, a filepath to a single `.fa` or `.fa.gz` file. Assumed to
        be accompanied by a `.fai` index file. Depending on the engine, the
        index may be created on the fly, and some compression formats may not
        be supported. If not a string, an iterable of fasta file paths each
        assumed to contain a single sequence.
    engine : {'pysam', 'pyfaidx'}, optional
        Module to use for loading sequences.
    kwargs : optional
        Options to pass to ``pysam.FastaFile`` or ``pyfaidx.Fasta``.

    Returns
    -------
    OrderedDict of (lazy) fasta records.

    Notes
    -----
    * pysam/samtools can read .fai and .gzi indexed files, I think.
    * pyfaidx can handle uncompressed and bgzf compressed files.

    rR   r   Nz'pysam is required to use engine='pysam'pyfaidxz+pyfaidx is required to use engine='pyfaidx'z#engine must be 'pysam' or 'pyfaidx')r"   r#   r   lowerrR   ImportError	FastaFiler   r   r   Fastanextiterkeysr'   )
r-   enginer0   is_multifiler   rR   onefiler   r5   r   s
             r2   r   r   ,  s   8 "+s333LmmG\\^^F	ILLLL 	I 	I 	IGHHH	I  	;& ; ;$U_W7777}Q' 0T : :;
 !7777B ; ; 0T : :; 
9			MNNNN 	M 	M 	MKLLL	M  	)& ) )"W]755f55DOO,, "4)
 {55f55B		 ) ) "4) >???Ns    A AC C*autoc                 T   |                                 }|dk    r)t          t          t          d          t          d}nd}|dv rM|d}|d}t          j        |           5 }|                    |||	          }ddd           n# 1 swxY w Y   n|dk    r{t          j        |           }|d}||                                |         }|                    |||          }t          j	        |g d
          }|
                    dd|           nt          d|           |S )a~  
    Read intervals from a bigWig file.

    Parameters
    ----------
    path : str
        Path or URL to a bigWig file
    chrom : str
    start, end : int, optional
        Start and end coordinates. Defaults to 0 and chromosome length.
    engine : {"auto", "pybbi", "pybigwig"}
        Library to use for querying the bigWig file.

    Returns
    -------
    DataFrame

    r   Nz9read_bigwig requires either the pybbi or pyBigWig packagepybbipybigwigr   bbir   r9   r;   )r9   r;   valuert   r:   2engine must be 'auto', 'pybbi' or 'pybigwig'; got )r   r   pyBigWigr   rx   fetch_intervalschroms	intervalsr(   r~   insertr'   r   r:   r9   r;   r   rZ   r1   ivalss           r2   r   r   s  s   & \\^^F;8+K   _FFF!!!=E;CXd^^ 	@q""53"??B	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 
:		M$=E;((**U#CE5#..\%)B)B)BCCC
		!We$$$$ VfVVWWWI   $B		BBc                 T   |                                 }|dk    r)t          t          t          d          t          d}nd}|dv rM|d}|d}t          j        |           5 }|                    |||	          }ddd           n# 1 swxY w Y   n|dk    r{t          j        |           }|d}||                                |         }|                    |||          }t          j	        |g d
          }|
                    dd|           nt          d|           |S )a~  
    Read intervals from a bigBed file.

    Parameters
    ----------
    path : str
        Path or URL to a bigBed file
    chrom : str
    start, end : int, optional
        Start and end coordinates. Defaults to 0 and chromosome length.
    engine : {"auto", "pybbi", "pybigwig"}
        Library to use for querying the bigBed file.

    Returns
    -------
    DataFrame

    r   Nz9read_bigbed requires either the pybbi or pyBigWig packager   r   r   r   r   r   )r9   r;   restrt   r:   r   )r   r   r   r   rx   r   r   entriesr(   r~   r   r'   r   s           r2   r   r     s   & \\^^F;8+K   _FFF!!!=E;CXd^^ 	@q""53"??B	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 
:		M$=E;((**U#C		%,,\%)A)A)ABBB
		!We$$$$ VfVVWWWIr   c           
         |7d}	 t          j        |          J n# t          $ r t          d          w xY w|                    d          rTt
          j                            |          s2t          j        |t
          j	                  rt          d| d          |}nqt
          j        
                    |d          }t
          j                            |          s2t          j        |t
          j	                  rt          d| d          d}dD ]}|| j        vrd	}t          | j                  d
k     rd	}|st          d| j                   || j        d         }ddd|g}| |                                         }	|	d                             t                    |	d<   |	                    g d          }	t#          j        d          5 }
t#          j        dd          5 }|                    |dd	           |                                 |	                    |
j        d|d	d	d           t-          j        ||
j        |j        |gd          }ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   |S )m  
    Save a bedGraph-like dataframe as a binary BigWig track.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame with columns 'chrom', 'start', 'end' and one or more value
        columns
    chromsizes : pandas.Series
        Series indexed by chromosome name mapping to their lengths in bp
    outpath : str
        The output BigWig file path
    value_field : str, optional
        Select the column label of the data frame to generate the track. Default
        is to use the fourth column.
    path_to_binary : str, optional
        Provide system path to the bedGraphToBigWig binary.

    NbedGraphToBigWigzbedGraphToBigWig is not present in the current environment. Pass it as 'path_to_binary' parameter to bioframe.to_bigwig or install it with, for example, conda install -y -c bioconda ucsc-bedgraphtobigwig zHbedGraphToBigWig is absent in the provided path or cannot be fexecuted: . zGbedGraphToBigWig is absent in the provided path or cannot be executed: Tr:   r9   r;   F   z+A bedGraph-like DataFrame is required, got    r:   r9   r;   z.bgsuffixwt.chrom.sizesr   r   r   r   r   r<   rF   r   na_repcapture_outputshutilwhich	Exceptionr'   r$   r   r   isfileaccessX_OKrW   r<   r>   copyr   r#   sort_valuestempfileNamedTemporaryFileto_csvflushr5   
subprocessrun)r1   r   outpathvalue_fieldpath_to_binarycmdis_bedgraphrn   r<   bgrZ   csps                r2   r   r     s]   *  	<$$0000 	 	 	)  	 
	 	 !3	4	4 w~~n-- 	")NBG2T2T 	1,1 1 1   gll>+=>>w~~c"" 	rybg'>'> 	0+0 0 0  
 K(    bj  K
2: USrzSSTTTjm4G	G				BW+$$S))BwK	111	2	2B		$E	2	2	2 
a		$T.	A	A	A
EG"$u555




		FgU5QV 	 	
 	
 	
 N!&"'7+
 
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 Hs?    7J.A1I+J+I/	/J2I/	3JJ	Jbed6c           
         |7d}	 t          j        |          J n# t          $ r t          d          w xY w|                    d          rTt
          j                            |          s2t          j        |t
          j	                  rt          d| d          |}nqt
          j        
                    |d          }t
          j                            |          s2t          j        |t
          j	                  rt          d| d          d}dD ]}|| j        vrd	}t          | j                  d
k     rd	}|st          d| j                   g d}| |                                         }	|	d                             t                    |	d<   |	                    g d          }	t#          j        d          5 }
t#          j        dd          5 }|                    |dd	           |                                 |	                    |
j        d|d	d	d           t-          j        |d| |
j        |j        |gd          }ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   |S )r   NbedToBigBedzbedToBigBed is not present in the current environment. Pass it as 'path_to_binary' parameter to bioframe.to_bigbed or install it with, for example, conda install -y -c bioconda ucsc-bedtobigbed zBbedToBigBed is absent in the provided path or cannot be executed: r   r   T)r:   r9   r;   r5   scorestrandF   z'A bed6-like DataFrame is required, got r:   r   z.bedr   r   r   r   r   r   r   z-type=r   r   )r1   r   r  r.   r  r  is_bed6rn   r<   bedrZ   r  r  s                r2   r   r   2  sY   * 	<$$0000 	 	 	$  	 
	 	 	/	/ w~~n-- 	")NBG2T2T 	0+0 0 0   gll>+=>>w~~c"" 	rybg'>'> 	0+0 0 0  
 GC  bj  G
2: QO2:OOPPPBBBG
W+



Cw<&&s++CL
//333
4
4C		$F	3	3	3 
q(:U^; ; ; 
	"$u555





FgU5QV 	 	
 	
 	
 N#6##QVRWg>
 
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 Hs?    7I5A5II5I"	"I5%I"	&I55I9<I9)NF)Tr3   TF)NNN)NNNNN)rR   )NNr   )NN)r  N)&r   rU   r   r   r   r  r   collectionsr   
contextlibr   numpyr   pandasr(   r   r   r   core.arropsr   core.stringopsr   schemasr	   r
   __all__r   r   r   r   r   r   r   r   r   r   r   r   rm   rO   r2   <module>r     s    				  				       # # # # # #              JJJJ   
CCCOOOO   HHH % $ $ $ $ $ ) ) ) ) ) ) ( ( ( ( ( ( ( (  ) ) ) )\ <J J J JZ   ( 4 4 4 4n& & & &R1 1 1 14 4 4 4 4 4 4 4 D D D DN4 4 4 4n4 4 4 4nN N N NbL L L L L Ls!   7 A AA
 
AA