o
    Nrfs                     @  s  d dl mZ d dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
mZmZ d dlZd dlZd dlZd dlZddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddl m!Z!m"Z" g dZ#ee$Z%dZ&g dZ'dXddZ(G dd deZ)			dYdZd&d'Z*d[d+d,Z+d\d1d2Z,d]d6d7Z-d^d8d9Z.	:d_d`d?d@Z/	dadbdEdFZ0G dGdH dHeZ1				dcdddMdNZ2				dcdedRdSZ3	dadfdVdWZ4dS )g    )annotationsN)bisect_right)OrderedDictdefaultdict)AnyIteratorLiteral   )
get_logger)
MapFunctor)__format_version_mcool__Cooler)ContactBinnercreate)lock)GenomeSegmentationparse_cooler_uri)merge_coolerscoarsen_coolerzoomify_cooler   )i  i  i  i'  ia  iP  i i i  i@B i%& i@KL i indexeslist[np.ndarray | h5py.Dataset]bufsizeintreturntuple[np.ndarray, np.ndarray]c                 C  s  t | d j}tt| D ]}|| | 7 }qd}|d }dg}dg}d}	 t|t|| ||dd }	|	|kr<|	d7 }	||	 |||	  ||	 |krOn|	}||	 }q&t |}t |}t 	|}
|
|k
 }|dkrt| d| d|
 dd	 ||fS )
a  
    Given ``k`` bin1_offset indexes, determine how to partition the data from
    ``k`` corresponding pixel tables for a k-way merge.

    The paritition is a subsequence of bin1 IDs, defining the bounds of chunks
    of data that will be loaded into memory from each table in a single "epoch"
    of merging data. The bounds are calculated such that no single epoch will
    load more than ``bufsize`` records into memory.

    However, the ``bufsize`` condition is not guaranteed and a warning will be
    raised if it cannot be satisfied for one or more epochs (see Notes).

    Parameters
    ----------
    indexes : sequence of 1D array-like of equal length
        Offset arrays that map bin1 IDs to their offset locations in a
        corresponding pixel table.
    bufsize : int
        Maximum number of pixel records loaded into memory in a single merge
        epoch.

    Returns
    -------
    bin1_partition : 1D array
        Bin1 IDs defining where to partition all the tables for merging.
    cum_nrecords : 1D array
        Cumulative number of records (from all pixel tables combined) that will
        be processed at each epoch.

    Notes
    -----
    The one exception to the post-condition is when a single bin1 increment in
    a table contains more than ``bufsize`` records.
    r   T)lor	   z/ merge epochs will require buffering more than z  pixel records, with as many as g.)npZzerosshaperangelenr   minappendarraydiffsumwarningswarnmax)r   r   Zcombined_indexiZcombined_startZcombined_nnzbin1_partitioncum_nrecordsr   hinrecords_per_epochZn_over r3   V/var/www/html/software/conda/envs/catlas/lib/python3.10/site-packages/cooler/reduce.pymerge_breakpoints.   sH   +



r5   c                   @  s*   e Zd ZdZ		ddddZdddZdS )CoolerMergerz,
    Implementation of cooler merging.

    Ncoolerslist[Cooler]mergebufr   columnslist[str] | Noneaggdict[str, Any] | Nonec           
      C  s.  t || _|| _|d u rdgn|| _dd | jD | _|d ur%| j| |d j}|d ur\tdd |D dkr=td|d j	}t
dt|D ]}t|| j	|ksYtd	qId S |d  g d
 d d  }t
dt|D ]#}||  g d
 d d  }	t|	t|kst|	|kstdqqd S )Ncountc                 S     i | ]}|d qS r*   r3   .0colr3   r3   r4   
<dictcomp>       z)CoolerMerger.__init__.<locals>.<dictcomp>r   c                 S  s   h | ]}|j qS r3   )binsizerB   cr3   r3   r4   	<setcomp>   rE   z(CoolerMerger.__init__.<locals>.<setcomp>r	   z%Coolers must have the same resolutionz&Coolers must have the same chromosomeschromstartendz$Coolers must have same bin structure)listr7   r9   r:   r<   updaterF   r%   
ValueError
chromsizesr$   r"   allbins)
selfr7   r9   r:   r<   rF   rQ   r.   rS   Zbins2r3   r3   r4   __init__   s.   


zCoolerMerger.__init__r   Iterator[dict[str, np.ndarray]]c           
      #  s"   dd | j D }t|| j\}}t|}tdt|  td|  td|  dd | j D }td|  dgt| j  }|d	d  D ]@  fd
d|D }t	j
dd t| j ||D ddd}|jddgdd| j }	dd |	 D V  td|  |}qNd S )Nc                 S  s   g | ]	}| d d qS )rindexes/bin1_offset)openrG   r3   r3   r4   
<listcomp>   s    z)CoolerMerger.__iter__.<locals>.<listcomp>zn_merge_epochs: zbin1_partition: znrecords_per_merge_epoch: c                 S  s   g | ]}t | qS r3   )r%   pixelsrG   r3   r3   r4   rZ          znnzs: r   r	   c                   s   g | ]}|  qS r3   r3   )rB   indexbin1_idr3   r4   rZ          c                 S  s.   g | ]\}}}|| d kr|  || qS )r   )r[   )rB   rH   rL   stopr3   r3   r4   rZ      s
    T)ZaxisZignore_indexr_   bin2_idsortc                 S     i | ]\}}||j qS r3   valuesrB   kvr3   r3   r4   rD      r\   z)CoolerMerger.__iter__.<locals>.<dictcomp>zrecords consumed: )r7   r5   r9   r"   r)   loggerinfor%   debugpdconcatzipgroupby	aggregater<   reset_indexitems)
rT   r   r/   r0   r2   ZnnzsZstartsstopscombineddfr3   r^   r4   __iter__   s6   
zCoolerMerger.__iter__)NN)r7   r8   r9   r   r:   r;   r<   r=   r   rV   )__name__
__module____qualname____doc__rU   rx   r3   r3   r3   r4   r6      s    	r6   
output_uristr
input_uris	list[str]r9   r:   r;   dtypesr=   r<   Nonec                 K  sL  t dd| dd |D }dd |D }t|r d}	nt|s'd}	ntd|d	u r2d
g}tt}
|D ]%}|	 j
}|D ]}||vrStd| d|j d|
| ||  qAq8|d	u rdi }|D ]}||vrutj|
|  ||< qf|d  g d d	d	 }|d jdd	}t||||d}t| ||f||||	d| d	S )a  
    Merge multiple coolers with identical axes.

    The merged cooler is stored at ``output_uri``.

    .. versionadded:: 0.8.0

    Parameters
    ----------
    output_uri : str
        Output cooler file path or URI.
    input_uris : list of str
        List of input file path or URIs of coolers to combine.
    mergebuf : int
        Maximum number of pixels processed at a time.
    columns : list of str, optional
        Specify which pixel value columns to include in the aggregation.
        Default is to use all available value columns.
    dtypes : dict, optional
        Specific dtypes to use for value columns. Default is to propagate
        the current dtypes of the value columns.
    agg : dict, optional
        Functions to use for aggregating each value column. Pass the same kind
        of dict accepted by ``pandas.DataFrame.groupby.agg``. Default is to
        apply 'sum' to every value column.
    kwargs
        Passed to ``cooler.create``.

    Notes
    -----
    The default output file mode is 'w'. If appending output to an existing
    file, pass `mode='a'`.

    See also
    --------
    cooler.coarsen_cooler
    cooler.zoomify_cooler

    zMerging:
{}
c                 S  s   g | ]}t |qS r3   r   )rB   pathr3   r3   r4   rZ     r`   z!merge_coolers.<locals>.<listcomp>c                 S  s   g | ]}|j d kqS )symmetric-upper)storage_mode)rB   clrr3   r3   r4   rZ   
  s    TFz1Cannot merge symmetric and non-symmetric coolers.Nr>   Pixel value column '' not found in input ''.r   rJ   zgenome-assembly)r9   r:   r<   )r:   r   assemblysymmetric_upper)rk   rl   formatjoinrR   anyrP   r   rN   r[   r   filenamer'   r"   Zresult_typerS   getr6   r   )r~   r   r9   r:   r   r<   kwargsZclrsZis_symmr   Z	dtype_mapr   Zpixel_dtypesrC   rS   r   iteratorr3   r3   r4   r      sV   1
	
r   edges
np.ndarraymaxlenc                   s   t | } t| dkr| d dksJ t jdt t | f } fddtdtt |d   D }|	|d  t 
t ||}| | S )zGiven an integer interval partition ``edges`` from 0..nnz, prune the
    edges to make the new subintervals roughly ``maxlen`` in length.
       r   c                   s   g | ]} | qS r3   r3   )rB   r.   r   r3   r4   rZ   ?  r`   z+_greedy_prune_partition.<locals>.<listcomp>r   )r"   Zasarrayr%   r_Zcumsumr)   r$   r   ceilr'   uniquesearchsorted)r   r   Zcumlencutsidxr3   r   r4   _greedy_prune_partition8  s   
*r   rQ   	pd.Seriesbase_binsizebins_per_tilec                 C  s6   || }t | }t|| }ttt|}|S )aH  
    Number of zoom levels for a quad-tree tiling of a genomic heatmap.

    At the base resolution, we need N tiles, where N is the smallest power of
    2 such that the tiles fully cover the 1D data extent. From that starting
    point, determine the number of zoom levels required to "coarsen" the map
    up to 1 tile.

    )r*   mathr   r   r"   log2)rQ   r   r   Ztile_length_bpZtotal_bpZn_tilesZn_zoom_levelsr3   r3   r4   get_quadtree_depthE  s
   r   rL   mulIterator[int]c                 c  s,    t | t |} }| V  	 | |9 } | V  q)z
    Generate a geometric progression of integers.

    Beginning with integer ``start``, yield an unbounded geometric progression
    with integer ratio ``mul``.

    r   rL   r   r3   r3   r4   geomprog`  s   r   c                 c  s0    t | } | V  	 dD ]}| | V  q| d9 } q	)z
    Generate a nice progression of integers.

    Beginning with integer ``start``, yield a sequence of "nicely" spaced
    integers: an unbounded geometric progression with ratio 10, interspersed
    with steps of ratios 2 and 5.

    T)r      
   r   r   r   r3   r3   r4   niceprogo  s   	r   nicera   styleLiteral['binary', 'nice']	list[int]c                 C  sp   | |krg S |dkrt | d}n|dkrt| }ntd| d t|g}	 t|}||kr2	 |S || q')a  
    Return a sequence of integers with a "preferred" stepping pattern.

    Parameters
    ----------
    start : int
        Starting value in the progression.
    stop : int
        Upper bound of progression, inclusive. Values will not exceed this.
    style : {'nice', 'binary'}
        Style of progression. 'nice' gives geometric steps of 10 with 2 and 5
        in between. 'binary' gives geometric steps of 2.

    Returns
    ------
    list of int

    Examples
    --------
    For certain values of `start` (n * 10^i), nice stepping produces familiar
    "preferred" sequences [1]_:

    Note denominations in Dollars (1-2-5)

        >>> preferred_sequence(1, 100, 'nice')
        [1, 2, 5, 10, 20, 50, 100]


    Coin denominations in Cents

        >>> preferred_sequence(5, 100, 'nice')
        [5, 10, 25, 50, 100]

    .. [1] https://en.wikipedia.org/wiki/Preferred_number#1-2-5_series

    binaryr   r   z1Expected style value of 'binary' or 'nice'; got 'r   )r   r   rP   nextr'   )rL   ra   r   genseqnr3   r3   r4   preferred_sequence  s   )


r   resolutionsbaseslist[int] | None)tuple[np.ndarray, np.ndarray, np.ndarray]c                 C  s  |du r
t | h}nt|}tt|| }tjt|td }tjt|td }t	t
|ddd D ])\}}|d }|dkr`|||  dkrX|||< |||  ||< n|d8 }|dksCq7t
|D ]\}}|dkr|| |vrtd||  d| dqe|||fS )	a"  
    From a set of target resolutions and one or more base resolutions
    deduce the most efficient sequence of integer multiple aggregations
    to satisfy all targets starting from the base resolution(s).

    Parameters
    ----------
    resolutions: sequence of int
        The target resolutions
    bases: sequence of int, optional
        The base resolutions for which data already exists.
        If not provided, the smallest resolution is assumed to be the base.

    Returns
    -------
    resn: 1D array
        Resolutions, sorted in ascending order.
    pred: 1D array
        Index of the predecessor resolution in `resn`. A value of -1 implies
        that the resolution is a base resolution.
    mult: 1D array
        Multiplier to go from predecessor to target resolution.

    N)Zdtyper   r	   r   zResolution z. cannot be derived from the base resolutions: r!   )r&   setr"   r(   sortedunionZonesr%   r   rN   	enumeraterP   )r   r   resnpredmultr.   targetpr3   r3   r4   get_multiplier_sequence  s2   
r   c                   @  sJ   e Zd ZdZefd!ddZed"ddZd#ddZd#ddZ	d$ddZ
d S )%CoolerCoarsenerz/
    Implementation of cooler coarsening.

    
source_urir   factorr   	chunksizer:   r   r<   r=   	batchsizemapr   c                 C  sl  || _ || _|| _t|tr|dksJ || _t|| _ddg| _t|| _	| j| j	 | _
dd | j	D | _|d ur@| j| t|}|j}	|j| _|d| _|d| _| jd u r`d | _n| j| | _| g d d d  }
| |
|	|| _t|	| j| _g }| jj D ]\}}| j| }| j|d  }|| j|||  q|| jd	  t|| j| _d S )
Nr	   r_   rb   c                 S  r?   r@   r3   rA   r3   r3   r4   rD     rE   z,CoolerCoarsener.__init__.<locals>.<dictcomp>zindexes/chrom_offsetrX   rJ   r   ) _mapr   r   
isinstancer   r   r   index_columnsrN   Zvalue_columnsr:   r<   rO   r   rQ   rF   Zold_binsizeZ
_load_dsetZold_chrom_offsetZold_bin1_offsetZnew_binsizerS   coarsen_binsnew_binsr   gsZidmaprt   extendr'   r   r   )rT   r   r   r   r:   r<   r   r   r   rQ   old_binsr   Z_chromr.   Zc0c1r3   r3   r4   rU     s<   





zCoolerCoarsener.__init__r   pd.DataFramerQ   r   r   c                   s2    fdd}| j dddg d |jddS )Nc                   sf   | ddg   jd d  }| d jd d  j}t|t|k r-tj| | j f }||d< |S )NrK   rL   rM   r	   )copyZilocrg   r%   r"   r   name)groupoutrM   rQ   r   r3   r4   _each9  s   z+CoolerCoarsener.coarsen_bins.<locals>._eachrK   TZobservedrJ   )Zdrop)rq   applyrs   )r   rQ   r   r   r3   r   r4   r   5  s   	zCoolerCoarsener.coarsen_binsspantuple[int, int]c                 C  sD  |\}}t | j}|jddd| j }||| }t| d|  | jj}| jj}| jj	}	| jj
}
|d j}|d j}|d j}|d j}|d u rp|	| | }|	| | }tj|
|d	d
d |d< tj|
|d	d
d |d< n$t|| t}t|| t}|| | |d< || | |d< |j| jdd| j S )NTF)r   Zconvert_enum Zchrom1Zchrom2start1start2right)Zsider	   r_   rb   rc   )r   r   r[   r:   rk   rl   r   rF   chrom_binoffsetchrom_absposstart_absposrg   r"   r   floorZastyper   rq   r   rr   r<   rs   )rT   r   r   r1   r   tablechunkrF   r   r   r   Z	chrom_id1Z	chrom_id2r   r   Z
abs_start1Z
abs_start2Zrel_bin1Zrel_bin2r3   r3   r4   
_aggregateH  s8   




zCoolerCoarsener._aggregatec              
   C  s8   z|  |}W |S  ty } ztt||d }~ww N)r   MemoryErrorRuntimeErrorr   )rT   r   r   er3   r3   r4   rr   p  s   zCoolerCoarsener.aggregaterV   c              	   c  s    | j }tt| jd d | jdd  }tdt||D ];}z|dkr(t  | | j	||||  }W |dkr>t
  n
|dkrHt
  w w |D ]}dd | D V  qKqd S )Nr   r	   r   c                 S  re   r3   rf   rh   r3   r3   r4   rD     r\   z,CoolerCoarsener.__iter__.<locals>.<dictcomp>)r   rN   rp   r   r$   r%   r   acquirer   rr   releasert   )rT   r   spansr.   resultsrw   r3   r3   r4   rx   w  s$   "
zCoolerCoarsener.__iter__N)r   r   r   r   r   r   r:   r   r<   r=   r   r   r   r   )r   r   rQ   r   r   r   r   r   )r   r   r   r   ry   )rz   r{   r|   r}   r   rU   staticmethodr   r   rr   rx   r3   r3   r3   r4   r     s    8

(r   base_urir   r   nprocc              	   K  s  t | }	t|}|du rdg}|du ri }|	 j}
|D ]}||
vr.td| d|	j d|||
|  qzG|dkrGt|}|dt	 t
| ||||||dkrU|jntd}|j}|d	d
 t|||f||	jdkd| W |dkr}|  dS dS |dkr|  w w )aP  
    Coarsen a cooler to a lower resolution by an integer factor *k*.

    This is done by pooling *k*-by-*k* neighborhoods of pixels and aggregating.
    Each chromosomal block is coarsened individually. Result is a coarsened
    cooler stored at ``output_uri``.

    .. versionadded:: 0.8.0

    Parameters
    ----------
    base_uri : str
        Input cooler file path or URI.
    output_uri : str
        Input cooler file path or URI.
    factor : int
        Coarsening factor.
    chunksize : int
        Number of pixels processed at a time per worker.
    nproc : int, optional
        Number of workers for batch processing of pixels. Default is 1,
        i.e. no process pool.
    columns : list of str, optional
        Specify which pixel value columns to include in the aggregation.
        Default is to use all available value columns.
    dtypes : dict, optional
        Specific dtypes to use for value columns. Default is to propagate
        the current dtypes of the value columns.
    agg : dict, optional
        Functions to use for aggregating each value column. Pass the same kind
        of dict accepted by ``pandas.DataFrame.groupby.agg``. Default is to
        apply 'sum' to every value column.
    kwargs
        Passed to ``cooler.create``.

    See also
    --------
    cooler.zoomify_cooler
    cooler.merge_coolers

    Nr>   r   r   r   r	   r   )r:   r<   r   r   r'   Tr   )r   r   )r   r   r[   r   rP   r   
setdefaultmpZPoolr   r   r   r   r   r   close)r   r~   r   r   r   r:   r   r<   r   r   Zinput_dtypesrC   poolr   r   r3   r3   r4   r     sV   6




r   	base_urisstr | list[str]outfilec                 K  s  t | tr| g} i }	i }
t }| D ]5}t|\}}t||}|jdu r%dn|j}||f|	|< | dd jddd 	 |
|< |
| qt||\}}}t|}td| d |du rcdg}|D ]}td	t|  |	| \}}t|d
k}t|dU}d| }||d ||d  ||d ||d  ddgt|D ]}||d|  ||d|   q||d ||d  || j|| j W d   n1 sw   Y  W d   n1 sw   Y  qet|D ]>}|| dkrq|||  }|||  }td| d| d t|d|  |d|  || |f||||dd| qt|d}|jdtd W d   dS 1 sOw   Y  dS )a  
    Generate multiple cooler resolutions by recursive coarsening.

    Result is a "zoomified" or "multires" cool file stored at ``outfile``
    using the MCOOL v2 layout, where coolers are stored under a hierarchy of
    the form ``resolutions/<r>`` for each resolution ``r``.

    .. versionadded:: 0.8.0

    Parameters
    ----------
    base_uris : str or sequence of str
        One or more cooler URIs to use as "base resolutions" for aggregation.
    outfile : str
        Output multires cooler (mcool) file path.
    resolutions : list of int
        A list of target resolutions to generate.
    chunksize : int
        Number of pixels processed at a time per worker.
    nproc : int, optional
        Number of workers for batch processing of pixels. Default is 1,
        i.e. no process pool.
    columns : list of str, optional
        Specify which pixel value columns to include in the aggregation.
        Default is to use only the column named 'count' if it exists.
    dtypes : dict, optional
        Specific dtypes to use for value columns. Default is to propagate
        the current dtypes of the value columns.
    agg : dict, optional
        Functions to use for aggregating each value column. Pass the same kind
        of dict accepted by ``pandas.DataFrame.groupby.agg``. Default is to
        apply 'sum' to every value column.
    kwargs
        Passed to ``cooler.create``.

    See also
    --------
    cooler.coarsen_cooler
    cooler.merge_coolers

    Nr	   rK   Tr   z$Copying base matrices and producing z new zoom levels.r>   z
Bin size: rW   wz/resolutions/z/chromsz/binsr_   rb   z/pixels/z/indexesr   zAggregating from z to r!   z::resolutions/r+)r   r:   r   r<   modezHDF5::MCOOL)r   zformat-version)r   r   r   r   r   rF   rS   rq   sizer-   addr   r%   rk   rl   h5pyFiler   rN   attrsrO   r$   r   r   )r   r   r   r   r   r:   r   r<   r   Zparsed_urisZn_bins_longest_chromZbase_resolutions	input_uriinfileingroupr   r   r   r   r   n_zoomssrcdestprefixrC   r.   Zprev_binsizerF   fwr3   r3   r4   r     s   
5


 
$r   r  tuple[int, dict[str, int]]c              	   C  s   t | \}}t||}t|j|jt}d}	tdt	|j  td|j  td|  tdd|   td| d| d d	  t
 }
t|}|j}td
t| d t|  t|d+}t|d}|||t| ||
|< W d   n1 sw   Y  W d   n1 sw   Y  t|d ddD ]9}||	9 }t|d }t|}tdt| d t|  t|d t| |d t| |	|||d ||
|< qt|d}|jd|i ||jd< |j|
 W d   ||
fS 1 sw   Y  ||
fS )zL
    Quad-tree tiling using legacy MCOOL layout (::0, ::1, ::2, etc.).

    r   ztotal_length (bp): z	binsize: z	n_zooms: zquad tile cover: zCopying base matrix to level z and producing z new zoom levels zcounting down to 0...zZoom level: z bin size: rW   r   Nr	   r   zAggregating at zoom level: z::)r   r   r   r   zmax-zoomz	max-zooms)r   r   r   rQ   rF   HIGLASS_TILE_DIMrk   rl   r"   r*   r   r   r  r  r   r$   r   r  rO   )r  r   r   r   r   r  r  r   r	  r   Zzoom_levelsZ	zoomLevelrF   r
  r  r.   Z	prevLevelr  r3   r3   r4   legacy_zoomifyr  st   


 


r  )r   r   r   r   r   r   )NNN)r~   r   r   r   r9   r   r:   r;   r   r=   r<   r=   r   r   )r   r   r   r   r   r   )rQ   r   r   r   r   r   r   r   )rL   r   r   r   r   r   )rL   r   r   r   )r   )rL   r   ra   r   r   r   r   r   r   )r   r   r   r   r   r   )r	   NNN)r   r   r~   r   r   r   r   r   r   r   r:   r;   r   r=   r<   r=   r   r   )r   r   r   r   r   r   r   r   r   r   r:   r;   r   r=   r<   r=   r   r   )
r  r   r   r   r   r   r   r   r   r  )5
__future__r   r   r+   bisectr   collectionsr   r   typingr   r   r   r  Zmultiprocessr   numpyr"   Zpandasrn   Z_loggingr
   Z_typingr   _versionr   apir   r   r   Zparallelr   utilr   r   __all__rz   rk   r  Z	ZOOMS_4DNr5   r6   r   r   r   r   r   r   r   r   r   r   r  r3   r3   r3   r4   <module>   sd    
XS
c


?: p 