
    tfkc                       d dl mZ d dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
mZmZmZmZ d dlZd dlZd dlZd dlmZmZ ddlmZmZ d*d	Zd+d
Zd,dZd,dZd-dZ	 d.	 	 	 	 	 d/dZ ej@                  dejB                        fd0dZ"d1dZ#d2dZ$	 	 d3	 	 	 	 	 	 	 d4dZ%d5dZ&d6dZ'd7dZ(e(Z)d8dZ*d9dZ+d:dZ,	 	 	 	 	 	 d;dZ-d<dZ.	 d.	 	 	 	 	 d=dZ/d>dZ0d.d?dZ1e	 d@	 	 	 	 	 dAd       Z2 G d  d!ejf                        Z4dBd"Z5d.d#Z6dddejn                  fd$Z8dCd%Z9	 d.	 	 	 	 	 	 	 	 	 dDd'Z: G d( d&      Z;	 dE	 	 	 	 	 dFd)Z<y)G    )annotationsN)OrderedDictdefaultdict)contextmanager)IOAnyContextManagerIterableIterator)
is_integer	is_scalar   )GenomicRangeSpecifierGenomicRangeTuplec                4    fdt        |       D        S )a  Partition an integer interval into equally-sized subintervals.
    Like builtin :py:func:`range`, but yields pairs of end points.

    Examples
    --------
    >>> for lo, hi in partition(0, 9, 2):
           print(lo, hi)
    0 2
    2 4
    4 6
    6 8
    8 9

    c              3  @   K   | ]  }|t        |z         f  y wN)min).0istepstops     U/var/www/html/software/conda/envs/higlass/lib/python3.12/site-packages/cooler/util.py	<genexpr>zpartition.<locals>.<genexpr>    s!     GQAHd#$Gs   )range)startr   r   s    ``r   	partitionr      s     HeE4.FGG    c                    | j                  d      }t        |      dk(  r|d   d}}||fS t        |      dk(  r|\  }}|j                  d      sd|z   }||fS t        d      )zW
    Parse a Cooler URI string

    e.g. /path/to/mycoolers.cool::/path/to/cooler

    z::r   r   /   zInvalid Cooler URI string)splitlen
startswith
ValueError)sparts	file_path
group_paths       r   parse_cooler_urir*   #   s     GGDME
5zQ %a#:	 j   
Uq %	:$$S)z)J j   455r   c                8    t        | j                  dd            S )N, )intreplace)r&   s    r   atoir0   6   s    qyyb!""r   c                   t        j                  d      }|j                  | j                  dd            \  }}}t	        |      st        |      S t        |      }|j                         j                         }|dv r|dz  }t        |      S |dv r|dz  }t        |      S |dv r|d	z  }t        |      S t        d
| d      )Nz
([0-9,.]+)r,   r-   )KKBi  )MMBi@B )GGBi ʚ;zUnknown unit '')
recompiler"   r/   r#   r.   floatupperstripr%   )r&   _NUMERIC_RE_valueunits        r   parse_humanizedrB   :   s    **\*K &&qyyb'9:NAudt95z%LE::<D{ u: 
	
 u:	 
	 u: >$q122r   c                    d }d fd}| j                  d      }|d   j                         }t        |      st        d      t        |      dk  r|ddfS  | ||d	               \  }}|||fS )
ac  
    Parse a UCSC-style genomic region string into a triple.

    Parameters
    ----------
    s : str
        UCSC-style string, e.g. "chr5:10,100,000-30,000,000". Ensembl and FASTA
        style sequence names are allowed. End coordinate must be greater than
        or equal to start.

    Returns
    -------
    (str, int or None, int or None)

    c           
   3  2  K   g d}dj                  |D cg c]  }d|d    d|d    d c}      }t        j                  d| t        j                        }|j	                  |       D ]#  }|j
                  }||j                  |      f % y c c}w w)	N))HYPHEN-)COORDz[0-9,]+(\.[0-9]*)?(?:[a-z]+)?)OTHERz.+z|\s*z(?P<r   >r   )z\s*)joinr9   r:   
IGNORECASEfinditer	lastgroupgroup)r&   
token_specpairpattern	tok_regexmatchtyps          r   	_tokenizez&parse_region_string.<locals>._tokenize^   s     


 ,,T4Qy$q'! <TUJJ#gY/?	''* 	(E//Cu{{3'''	(  Us   BBA+Bc                    | )t        dj                  dj                  |                  | |vrt        d| d      y )NzExpected {} token missingz or zUnexpected token "")r%   formatrK   )rU   tokenexpecteds      r   _check_tokenz)parse_region_string.<locals>._check_tokenj   sI    ;8??H@UVWW(" #5eWA!>?? #r   c                    t        | d      \  }} ||dg       t        |      }t        | d      \  }} ||dg       t        | d      \  }}||d fS  ||dg       t        |      }||k  rt        d      ||fS )N)NNrG   rE   zEnd coordinate less than start)nextrB   r%   )tokensrU   rZ   r   endr\   s        r   _expectz$parse_region_string.<locals>._expectq   s    &,/
US%'+&&,/
US%(,&,/
U;$;S%'+e$;=>>czr   :r   zChromosome name cannot be emptyr!   Nr   )r"   r=   r#   r%   )r&   rV   ra   r'   chromr   r`   r\   s          @r   parse_region_stringrd   M   s    "
(@& GGCLE!HNNEu::;;
5zA~tT""58,-JE35#r   c                r   t        | t              rt        |       \  }}}n$| \  }}}|t        |      n|}|t        |      n|}	 |||   nd}|dn|}||t        d      |}||k  rt        d      |dk  s|||kD  rt        d| d| d      |||fS # t        $ r}t        d|       |d}~ww xY w)	aU  
    Genomic regions are represented as half-open intervals (0-based starts,
    1-based ends) along the length coordinate of a contig/scaffold/chromosome.

    Parameters
    ----------
    reg : str or tuple
        UCSC-style genomic region string, or
        Triple (chrom, start, end), where ``start`` or ``end`` may be ``None``.
    chromsizes : mapping, optional
        Lookup table of scaffold lengths to check against ``chrom`` and the
        ``end`` coordinate. Required if ``end`` is not supplied.

    Returns
    -------
    A well-formed genomic region triple (str, int, int)

    NzUnknown sequence label: r   z Cannot determine end coordinate.zEnd cannot be less than startzGenomic region out of bounds: [z, rJ   )
isinstancestrrd   r.   KeyErrorr%   )reg
chromsizesrc   r   r`   clenes          r   parse_regionrm      s    , #s/4ucuc#/E
U/c#hsD$.$:z%  AEE
{<?@@
U{899qyT%#*:5'C5JKK%  D3E7;<!CDs   	B 	B6"B11B6z(\d+)c                    t        |j                  |       D cg c]"  }|s|j                         rt        |      n|$ c}      S c c}w r   )tupler"   isdigitr.   )r&   	_NS_REGEXxs      r   natsort_keyrs      s7    	8JP1aAIIK#a&Q.PQQPs
   AAc                $    t        | t              S )N)key)sortedrs   )iterables    r   	natsortedrx      s    (,,r   c                    t        j                  |       } t        |       st        j                  g t              S t        t        d | D               }t        j                  |d d d         S )Ndtypec              3  2   K   | ]  }t        |        y wr   )rs   )r   rr   s     r   r   zargnatsort.<locals>.<genexpr>   s     5!{1~5s   )npasarrayr#   arrayr.   ro   ziplexsort)r   colss     r   
argnatsortr      sT    JJuEu:xx#&&5u567D::d4R4j!!r   c                   t        | t              r#| j                  d      r|j                  dd       t	        j
                  | fdddgddgdt        id	|}|smg }|D ]O  }||d   j                  j                  |         }|j                  t        |d            }|j                  |       Q t	        j                  |d
      }|d   j                  |_        |d   S )at  
    Parse a ``<db>.chrom.sizes`` or ``<db>.chromInfo.txt`` file from the UCSC
    database, where ``db`` is a genome assembly name.

    Parameters
    ----------
    filepath_or : str or file-like
        Path or url to text file, or buffer.
    name_patterns : sequence, optional
        Sequence of regular expressions to capture desired sequence names.
        Each corresponding set of records will be sorted in natural order.
    all_names : bool, optional
        Whether to return all contigs listed in the file. Default is
        ``False``.

    Returns
    -------
    :py:class:`pandas.Series`
        Series of integer bp lengths indexed by sequence name.

    References
    ----------
    * `UCSC assembly terminology <http://genome.ucsc.edu/FAQ/FAQdownloads.html#download9>`_
    * `GRC assembly terminology <https://www.ncbi.nlm.nih.gov/grc/help/definitions>`_

    z.gzcompressiongzip	r   r   namelength)sepusecolsnamesr{   axis)rf   rg   endswith
setdefaultpdread_csvcontainsilocr   appendconcatvaluesindex)filepath_orname_patterns	all_nameskwargs
chromtabler'   rR   parts           r   read_chromsizesr      s    @ +s#(<(<U(C-0Ax sm J $ 	Gj044==gFGD99ZV56DLL	 YYu1-
!&)00Jhr   c                "    t        d|  dfi |S )zo
    Download chromosome sizes from UCSC as a :py:class:`pandas.Series`, indexed
    by chromosome label.

    z*http://hgdownload.soe.ucsc.edu/goldenPath/z/database/chromInfo.txt.gz)r   )dbr   s     r   fetch_chromsizesr     s&     
4RD8RS
 r   c                   ddl }t        |      dk(  rt        d      t        |      dk(  r|j                  |d   d      n5i |D ].  }j	                  |j                  |d      j
                         0 t        fd| D              }|S )az  
    Load lazy FASTA records from one or multiple files without reading them
    into memory.

    Parameters
    ----------
    names : sequence of str
        Names of sequence records in FASTA file or files.
    filepaths : str
        Paths to one or more FASTA files to gather records from.

    Returns
    -------
    OrderedDict of sequence name -> sequence record

    r   NzNeed at least one filer   T)as_rawc              3  ,   K   | ]  }||   f  y wr    )r   rc   fas     r   r   zload_fasta.<locals>.<genexpr>.  s     @5"U),@s   )pyfaidxr#   r%   Fastaupdaterecordsr   )r   	filepathsr   filepathr   r   s        @r   
load_fastar     s    " 
9~122
9~]]9Q<]5 ! 	DHIIgmmHTm:BBC	D @%@@GNr   c                      fd}t        j                  t        | j                               dd      }t        j                  |d   t         j                        d      |d<   |S )af  
    Divide a genome into evenly sized bins.

    Parameters
    ----------
    chromsizes : Series
        pandas Series indexed by chromosome name with chromosome lengths in bp.
    binsize : int
        size of bins in bp

    Returns
    -------
    bins : :py:class:`pandas.DataFrame`
        Dataframe with columns: ``chrom``, ``start``, ``end``.

    c                    |    }t        t        j                  |z              }t        j                  d|dz         z  }||d<   t	        j
                  | g|z  |d d |dd  dg d      S Nr   r   r}   )rc   r   r`   columns)r.   r~   ceilaranger   	DataFrame)rc   rk   n_binsbinedgesbinsizerj   s       r   _eachzbinnify.<locals>._eachD  sy    % RWWTG^,-99Q!-7||g&#2xPQPR|T-
 	
r   r   Tr   ignore_indexrc   
categoriesordered)r   r   mapkeysCategoricallistr   )rj   r   r   bintables   ``  r   binnifyr   2  s[    $
 yyUJOO$56QTRHd:+;+;&<dHW Or   c                4    	 ddl m} ddlm  j                         }	 t        ||      j                   fd}t        j                  t        ||      dd      S # t        $ r t	        d      dw xY w# t        $ r}t        d|       |d}~ww xY w)av  
    Divide a genome into restriction fragments.

    Parameters
    ----------
    fasta_records : OrderedDict
        Dictionary of chromosome names to sequence records.
    enzyme: str
        Name of restriction enzyme (e.g., 'DpnII').

    Returns
    -------
    frags : :py:class:`pandas.DataFrame`
        Dataframe with columns: ``chrom``, ``start``, ``end``.

    r   Nz4Biopython is required to find restriction fragments.zUnknown enzyme name: c                b   j                  t        |    d d              }t        j                  dt        j                   |            dz   t        |      f   j                  t        j                        }t        |      dz
  }t        j                  | g|z  |d d |dd  dg d      }|S r   )
Seqrg   r~   r_r   r#   astypeint64r   r   )rc   seqcutsn_fragsfragsbioseq
cut_finderfasta_recordss        r   r   zdigest.<locals>._eachz  s    jj]51!456uuQC1A5s3x?@GGQd)a-g'$s)DHM-
 r   Tr   )Bio.RestrictionRestrictionBio.Seqr   ImportErrorr   getattrsearchAttributeErrorr%   r   r   r   )r   enzymebiorstchromsrl   r   r   r   s   `     @@r   digestr   Z  s    "(  !FBVV,33
	 99S'adCC/  B
	  B09:ABs"   A! A: !A7:	BBBc                   t               }| j                  dd      D ]J  \  }}|j                  |d   |d   z
  j                  dd j	                                t        |      dkD  sJ y t        |      dk(  rt        t        |            S y)	z
    Infer bin size from a bin DataFrame. Assumes that the last bin of each
    contig is allowed to differ in size from the rest.

    Returns
    -------
    int or None if bins are non-uniform

    rc   T)observedr`   r   Nr}   r   )setgroupbyr   r   uniquer#   r^   iter)binssizes_chromrO   s       r   get_binsizer     s     EEg= eElU7^399#2>EEGHu:> 5zQDK  r   c                    | j                  dgd      ddg   j                  d      j                  ddd	
      }t        |d         t        |d         }}t	        j
                  ||      S )z
    Infer chromsizes Series from a bin DataFrame. Assumes that the last bin of
    each contig is allowed to differ in size from the rest.

    Returns
    -------
    int or None if bins are non-uniform

    rc   last)keepr`   T)dropr   r   )rc   r`   r   r   data)drop_duplicatesreset_indexrenamer   r   Series)r   r   r   lengthss       r   get_chromsizesr     su     	gYV4gu5EF	$		&:	; 
 :f-.Z5I0JGF99600r   c                   t        ||      \  }}}| j                  |      }|dkD  s|||   k  rU|d   j                  j                  |d      }||d   j                  |d j                  |d      z   }|j                  || }|S )zN
    Range query on a BED-like dataframe with non-overlapping intervals.

    r   r`   rightsider   Nleft)rm   	get_groupr   searchsortedr   )	groupedrj   regionrc   r   r`   resultlohis	            r   bedslicer    s     %VZ8E5#u%FqyC*U++E]!!..u7.C&/((-::3V:LLR#Mr   c                d    t        | t        j                        r| S t        j                  |       S r   )rf   h5pyDatasetr~   r   )rr   s    r   asarray_or_datasetr    s"    1dll+1>A>r   c                   t         j                  }t        |       } t        |       }|dk(  rVt        j                  g t
              t        j                  g t
              t        j                  g | j                        fS ||}g g }}t         j                  }t        d||      D ]h  }| |||z    } ||dd |dd k7        dz   }	|d   |k7  rt         j                  d|	f   }	|j                  ||	z          |j                  ||	          |d   }j t        j                  |      }t        j                  t         j                  ||f         }
t        j                  |      }||
|fS )aS  
    Run length encoding.
    Based on http://stackoverflow.com/a/32681075, which is based on the rle
    function from R.

    Parameters
    ----------
    x : 1D array_like
        Input array to encode
    dropna: bool, optional
        Drop all runs of NaNs.

    Returns
    -------
    start positions, run lengths, run values

    r   rz   Nr   r}   )r~   flatnonzeror  r#   r   r.   r{   nanr   r   r   concatenatediff)r   	chunksizewherenstartsr   last_valr   rr   locsr   s              r   rlencoder    sQ   * NNEu%EE
AAvHHRs#HHRs#HHRu{{+
 	
 	FFvvH1a# !a)m$QqrUaf_%)Q4855D>Da$hagR5 ^^F#FggbeeFAI&'G^^F#F7F""r   c                     t         fdt        j                  d   j                  t        j                        D              S )Nc              3     K   | ]F  }t        j                  t         j                  j                  |      t         j                         H y wr   )osaccesspathrK   X_OK)r   r  cmds     r   r   zcmd_exists.<locals>.<genexpr>  s7       			"'',,tS)2773s   AAPATH)anyr  environr"   pathsep)r  s   `r   
cmd_existsr    s5     JJv&,,RZZ8  r   c           	         t        j                  t        j                  | t        j                  | |      z
        |      S r   )r~   medianabs)r   r   s     r   madr!     s,    99RVVD299T4#8894@@r   c              /    K   t        | t              rd}t        j                  | |g|i |}njd}|dk(  r| j                  j
                  dk(  rnG|dv r$| j                  j
                  dk(  rt        d      |dk(  rt        d      |d	v rt        d
      | }	 | |r|j                          yy# |r|j                          w w xY ww)a  
    Context manager like ``h5py.File`` but accepts already open HDF5 file
    handles which do not get closed on teardown.

    Parameters
    ----------
    fp : str or ``h5py.File`` object
        If an open file object is provided, it passes through unchanged,
        provided that the requested mode is compatible.
        If a filepath is passed, the context manager will close the file on
        tear down.

    mode : str
        * r        Readonly, file must exist
        * r+       Read/write, file must exist
        * a        Read/write if exists, create otherwise
        * w        Truncate if exists, create otherwise
        * w- or x  Fail if exists, create otherwise

    TFrr+)r$  az%File object provided is not writeablewzCannot truncate open file)zw-rr   zFile existsN)rf   rg   r  Filefilemoder%   close)fpr)  argsr   own_fhfhs         r   	open_hdf5r/    s     6 "cYYr41$1&13;277<<4/[ RWW\\S%8DEES[899[ ]++HHJ 6HHJ s   BCB3 C3CCc                  8     e Zd Zd fdZddZddZddZ xZS )closing_hdf5c                8    t         |   |j                         y r   )super__init__id)selfgrp	__class__s     r   r4  zclosing_hdf5.__init__6  s     r   c                    | S r   r   r6  s    r   	__enter__zclosing_hdf5.__enter__9  s    r   c                6    | j                   j                         S r   r(  r*  )r6  exc_infos     r   __exit__zclosing_hdf5.__exit__<  s    yy  r   c                8    | j                   j                          y r   r=  r:  s    r   r*  zclosing_hdf5.close?  s    		r   )r7  
h5py.Group)returnrA  )rB  None)__name__
__module____qualname__r4  r;  r?  r*  __classcell__)r8  s   @r   r1  r1  5  s    !!r   r1  c                    t        |       }| j                         D ]  \  }}	 |j                         ||<    |S # t        $ r |j	                         ||<   Y <t
        $ r |||<   Y Kw xY wr   )dictitemsitemr%   tolistr   )attrsoutkvs       r   attrs_to_jsonablerQ  C  sp    
u+C 1	VVXCF J	  	 XXZCF 	CF	s   :A(A('A(c                  	 t        j                  d      t        j                  d      t        j                  d      t        j                  d      t        j
                  d      t        j
                  d      t        j                  d      dd	d	fdfd	}dfd
	}t        | d      r| j                  S t        | t        j                  t        j                  f      r| j                  dd S t        | t        j                        r| dd S ||n|dd }t        | t              rBt        j                  | j!                         D ci c]  \  }}| ||||       c}}|      S t        | t"              rt%        |       dk(  r || d   | d   |      S t        | t&        t"        f      rht)        d | D              st+        d|        t        j                  | D ci c]  \  }}| ||||       c}}| D cg c]  \  }}|	 c}}|      S t        | d      s | 	 t        j,                  |       } |      S t/        |       r ||       S t1        d|        c c}}w c c}}w c c}}w #  Y 7xY w)a  
    Extracted and modified from dask/dataframe/utils.py :
        make_meta (BSD licensed)

    Create an empty pandas object containing the desired metadata.

    Parameters
    ----------
    x : dict, tuple, list, pd.Series, pd.DataFrame, pd.Index, dtype, scalar
        To create a DataFrame, provide a `dict` mapping of `{name: dtype}`, or
        an iterable of `(name, dtype)` tuples. To create a `Series`, provide a
        tuple of `(name, dtype)`. If a pandas object, names, dtypes, and index
        should match the desired output. If a dtype or scalar, a scalar of the
        same dtype is returned.
    index :  pd.Index, optional
        Any pandas index to use in the metadata. If none provided, a
        `RangeIndex` will be used.

    Examples
    --------
    >>> make_meta([('a', 'i8'), ('b', 'O')])
    Empty DataFrame
    Columns: [a, b]
    Index: []
    >>> make_meta(('a', 'f8'))
    Series([], Name: a, dtype: float64)
    >>> make_meta('i8')
    1

    T    z
1970-01-01r   foo)bVr4   mSr%  UO__UNKNOWN_CATEGORIES__c                .   | j                   dv r| j                  d      S | j                   dk(  r| j                  t        dd            S | j                   v r0| j                      }| j                   dv r|j                  |       S |S t	        d|        )N)r   fur   cr   )rW  r4   zCan't handle dtype: )kindtypecomplexr   	TypeError)r{   o_simple_fake_mappings     r   _scalar_from_dtypez&infer_meta.<locals>._scalar_from_dtype|  s    ::(::a= ZZ3::gam,,ZZ//$UZZ0A&+jjJ&>188E?EAE25':;;r   c                d   t        | t        j                  t        j                  t        j                  f      r| S t        j                  |       r>t        | d      r| j                  nt        j                  t        |             } |      S t        dt        |       j                   d      )Nr{   zCan't handle meta of type 'r8   )rf   r   	Timestamp	TimedeltaPeriodr~   isscalarhasattrr{   ra  rc  rD  )rr   r{   rf  s     r   _nonempty_scalarz$infer_meta.<locals>._nonempty_scalar  s|    a",,bii@AH[[^&q'2AGGa8IE%e,,<T!W=M=M<NaPQQr   c                    t        |t              r>|dk(  r9t        j                  t        j                  g      | |      j
                  d d S t        j                  g || |      S )Ncategory)r   r   r   )r{   r   r   )rf   rg   r   r   r   r   )r   r{   r   UNKNOWN_CATEGORIESs      r   _empty_seriesz!infer_meta.<locals>._empty_series  s^    eS!ez&999 2344ud2A  yy5t5AAr   _metar   )r   r!   c              3  \   K   | ]$  }t        |t              xr t        |      d k(   & yw)r!   N)rf   ro   r#   )r   r   s     r   r   zinfer_meta.<locals>.<genexpr>  s'     CA:a'7CFaK7Cs   *,z2Expected iterable of tuples of (name, dtype), got r   r   r{   z'Don't know how to create metadata from r   )r~   bool_void
datetime64timedelta64str_unicode_rl  rr  rf   r   r   r   r   IndexrI  rJ  ro   r#   r   allr%   r{   r   rc  )
rr   r   rm  rq  r_  dr{   rp  rf  re  s
          @@@r   
infer_metar~  O  s9   B XXd^WWT]]]<(^^AWWU^WWU^[[	 2	<RB q'ww!bii./vva{	Arxx	 1v]Ea
E!T||?@wwyIVaQa%00IQV
 	
 !UA!QqT1Q4u55	Ae}	%CCCGsK  ||?@AVaQa%00A#$%41aQ%
 	

 Q Q]	HHQKE%e,,
 |""
=aSA
BB9 J B%	s   I+
*I1
I7-I= =Jc           	        t        |       } t        t              st        fd      nMj	                         }t        fd      |j                         D ]  \  }}t        |      r| |   n|}||<    ||du rt        j                  g       }	n|D 
cg c]  }
t        j                  g |
          }}
t        |      dk(  rt        j                  |d   |d         }	n!t        j                  j                  ||      }	|j                          t        |      D ]  \  }}| j                  ||z
          | D ci c]  }|t        j                  g |          }}t        j                   || |		      S c c}
w c c}w )
ze
    Extracted and modified from pandas/io/parsers.py :
        _get_empty_meta (BSD licensed).

    c                     xs  S r   r   )default_dtyper{   s   r   <lambda>zget_meta.<locals>.<lambda>  s    E$:] r   c                      S r   r   )r  s   r   r  zget_meta.<locals>.<lambda>  s    M r   Frz   r   r   )r   )r   rt  )r   rf   rI  r   copyrJ  r   r   r{  r   r#   
MultiIndexfrom_arrayssort	enumeratepopr   )r   r{   index_columnsindex_namesr  _dtyperO  rP  colr   r   r   r   r  col_namecol_dicts    `  `           r   get_metar    sj    7mG
 eT":; 12 LLN 	DAq *1'!*1CE#J	  6=HIT		"E$K0IIt9>HHT!W;q>:EMM--d+-FEm, 	DAqKKA	 PWW8"))BeHo>>WHW<<'?? J Xs   !F"Fc                X   t        | d   j                  t        j                        }| j	                         } |s:t        j
                  | j                  t        |j                        d      | d<   | S | d   j                  j                  |j                  k(  j                         sJ | S )Nrc   Tr   )rf   r{   r   CategoricalDtyper  r   rc   r   r   catr   r|  )r   rj   is_cats      r   
check_binsr    s    W++R-@-@AF99;DJJ4
(8(8#94
W K W!!,,
0@0@@EEGGGKr   GenomeSegmentationc           
        | j                   }|j                         }||}|j                         }||j                  |   z  }|j                  |   |z  }g }|D ]  \  }	|vr| j                     }
t        t        j                  ||j                     z              }|	j                  j                  d d |   }|d   |
k7  rt        j                  ||
f   }|j                  fdt        |d d |dd        D                |S )Nr}   c              3  .   K   | ]  \  }}||f  y wr   r   )r   r   r`   rc   s      r   r   z%balanced_partition.<locals>.<genexpr>  s!      
$.E3UE3
s   r   )_bins_groupedsizeidxmaxlocrj   r.   r~   r   r   r   r   extendr   )gsn_chunk_maxfile_contigsloadingsr   chrom_nbinschrmaxconstgrangesrO   rk   r   anchorsrc   s                @r   balanced_partitionr    s    G,,.K__F(,,v..HOOF#k1EG 

u$}}U#27758<<#6678++$$VtV,2;$eeGTM*G 
25gcrlGABK2P
 	


 Nr   c                      e Zd ZddZddZy)r  c                   t        ||      }|j                  ddd      | _        | j                  j                         j                  }|| _        t        |      | _        t        |j                               | _
        || _        t        j                  |j                         t        t        |                  | _        t"        j$                  dt#        j&                  |      f   | _        t"        j$                  dt#        j&                  |j                        f   | _        | j*                  |d   j,                  j.                     |d   j                  z   | _        y )Nrc   TF)r   r  r   r   r   )r  r   r  r  r   rj   r   r   r   r   contigsr   r   r   r   r#   idmapr~   r   cumsumchrom_binoffsetchrom_absposr  codesstart_abspos)r6  rj   r   nbins_per_chroms       r   r4  zGenomeSegmentation.__init__  s   $
+!\\'Du\M,,113::$"4(JOO-.	YYZ__%6U3z?=ST
!uuQ		/(B%BCEE!RYYz/@/@%A"ABd7m//556g9M9MM 	r   c                H   t        || j                        \  }}}| j                  j                  |      }|dkD  s|| j                  |   k  rU|d   j                  j                  |d      }||d   j                  |d  j                  |d      z   }|j                  || }|S )Nr   r`   r   r   r   r   )rm   rj   r  r   r   r   r   )r6  r   rc   r   r`   r   r   r   s           r   fetchzGenomeSegmentation.fetch(  s    (Auc##--e419dooe44%%225w2GBfWo,,RS1>>s>PPB[[B'Fr   N)rj   	pd.Seriesr   pd.DataFrame)r   r   rB  r  )rD  rE  rF  r4  r  r   r   r   r  r    s    
 r   c              #     K   g }d}| D ]D  }|t        |      z  }|j                  |       ||kD  s(t        j                  |d       g }d}F t        |      rt        j                  |d       yyw)a  
    Take an incoming iterator of small data frame chunks and buffer them into
    an outgoing iterator of larger chunks.

    Parameters
    ----------
    chunks : iterator of :py:class:`pandas.DataFrame`
        Each chunk should have the same column names.
    size : int
        Minimum length of output chunks.

    Yields
    ------
    Larger outgoing :py:class:`pandas.DataFrame` chunks made from concatenating
    the incoming ones.

    r   r   N)r#   r   r   r   )chunksr  bufr  chunks        r   bufferedr  2  s{     * C	A 	SZ

5t8))Ca((CA 3xii!$$ s   .A6AA6)r   r.   r   r.   r   r.   rB  zIterator[tuple[int, int]])r&   rg   rB  ztuple[str, str])r&   rg   rB  r.   )r&   rg   rB  z"tuple[str, int | None, int | None]r   )ri   r   rj   zdict | pd.Series | NonerB  r   )r&   rg   rB  ro   )rw   Iterable[str]rB  	list[str])r   r  rB  
np.ndarray))z^chr[0-9]+$z	^chr[XY]$z^chrM$F)r   zstr | IO[str]r   ztuple[str, ...]r   boolrB  r  )r   rg   rB  r  )r   r  r   rg   rB  OrderedDict[str, Any])rj   r  r   r.   rB  r  )r   r  r   rg   rB  r  )r   r  rB  
int | None)r   r  rB  r  )rj   zpd.Series | dictr   r   rB  r  )rr   r   rB  znp.ndarray | h5py.Dataset)r   r  r  r  rB  z)tuple[np.ndarray, np.ndarray, np.ndarray])r  rg   rB  r  )r   r  r   r  rB  r  )r#  )r+  zstr | h5py.Groupr)  rg   rB  zContextManager[h5py.Group])rM  zh5py.AttributeManagerrB  rI  )r   r  rj   r  rB  r  )
r  r  r  r.   r  r  r  zlist[int | float] | NonerB  zlist[GenomicRangeTuple])i )r  zIterable[pd.DataFrame]r  r.   rB  zIterator[pd.DataFrame])=
__future__r   r  r9   collectionsr   r   
contextlibr   typingr   r   r	   r
   r   r  numpyr~   pandasr   pandas.api.typesr   r   _typingr   r   r   r*   r0   rB   rd   rm   r:   rY  rs   rx   r   r   r   r   r   make_bintabler   r   r   r  r  r  r  r!  r/  Groupr1  rQ  r~  object_r  r  r  r  r  r   r   r   <module>r     s   " 	 	 0 % > >    2 =H$!&#&>F +/.	.'. .b #-"**Xrtt"< R-" &O2 2 "2  2 
 2 j	D"J +D\*1&  " 	$? !0#0#0# /0#fA  --
-
  - -`4:: 	nCd tRZZ(@V
" *.	  '	
 > : %"%
% %r   