
    DUf                       d Z ddlmZmZ ddlZddlZddlZddlmZ ddl	m
Z
 ddlZddlZddlmZ ddlZddlmZmZ dd	lmZmZ dd
lmZmZmZ ddlmZ  ej        dej        j                    ej         ej!                   dZ"dZ#d Z$d Z%dZ&dZ'g dZ(d Z)d Z*d Z+e"fdZ,	 	 	 	 	 d6dZ-d7dZ.d Z/d Z0d  Z1e"fd!Z2d" Z3e"fd#Z4d$e"fd%Z5e"d&d'd(d)fd*Z6ee7fd+            Z8edde7fd,            Z9	 	 	 	 	 	 	 	 	 	 	 	 d8d5Z:dS )9a  
Collection of functions related to dot-calling

The main user-facing API function is:

.. code-block:: python

    dots(
        clr,
        expected,
        expected_value_col="balanced.avg",
        clr_weight_name="weight",
        view_df=None,
        kernels=None,
        max_loci_separation=10_000_000,
        max_nans_tolerated=1,
        n_lambda_bins=40,
        lambda_bin_fdr=0.1,
        clustering_radius=20_000,
        cluster_filtering=None,
        tile_size=5_000_000,
        nproc=1,
    )

This function implements HiCCUPS-style dot calling, but enables user-specified
modifications at multiple steps. The current implementation makes two passes
over the input data, first to create a histogram of pixel enrichment values, 
and second to extract significantly enriched pixels.

- The function starts with compatibility verifications   
- Recommendation or verification for `kernels` is done next.  
  Custom kernels must satisfy properties including: square shape,
  equal sizes, odd sizes, zeros in the middle, etc. By default,
  HiCCUPS-style kernels are recommended based on the binsize.
- Lambda bins are defined for multiple hypothesis  
  testing separately for different value ranges of the locally adjusted expected.
  Currently, log-binned lambda-bins are hardcoded using a pre-defined
  BASE of 2^(1/3). `n_lambda_bins` controls the total number of bins.
  for the `clr`, `expected` and `view` of interest.
- Genomic regions in the specified `view`(all chromosomes by default)  
  are split into smaller tiles of size `tile_size`.
- `scoring_and_histogramming_step()` is performed independently  
  on the genomic tiles. In this step, locally adjusted expected is
  calculated using convolution kernels for each pixel in the tile.
  All surveyed pixels are histogrammed according to their adjusted 
  expected and raw observed counts. Locally adjusted expected is 
  not stored in memory.
- Chunks of histograms are aggregated together and a modified BH-FDR  
  procedure is applied to the result in `determine_thresholds()`.
  This returns thresholds for statistical significance 
  in each lambda-bin (for observed counts), along with the adjusted
  p-values (q-values).
- Calculated thresholds are used to extract statistically significant  
  pixels in `scoring_and_extraction_step()`. Because locally adjusted
  expected is not stored in memory, it is re-caluclated
  during this step, which makes it computationally intensive.
  Locally adjusted expected values are required in order to apply
  different thresholds of significance depending on the lambda-bin.
- Returned filtered pixels, or 'dots', are significantly enriched  
  relative to their locally adjusted expecteds and thus have potential
  biological interest. Dots are further annotated with their 
  genomic coordinates and q-values (adjusted p-values) for
  all applied kernels.
- All further steps perform optional post-processing on called dots

  - enriched pixels that are within `clustering_radius` of each other  
    are clustered together and the brightest one is selected as the
    representative position of a dot.
  - cluster-representatives along with "singletons" (enriched pixels  
    that are not part of any cluster) can be subjected to further
    empirical enrichment filtering in `cluster_filtering_hiccups()`. This 
    both requires clustered dots exceed prescribed enrichment thresholds 
    relative to their local neighborhoods and that singletons pass an 
    even more stringent q-value threshold.
    )partialreduceN)convolve)poisson)Birch   )LazyToeplitz
get_kernel)assign_regionsmake_cooler_view)is_cooler_balancedis_compatible_viewframeis_valid_expected)pool_decoratorignore)actioncategory)levelcountzexp.rawc                     d|  dS )Nla_exp..value kernel_names    T/var/www/html/software/conda/lib/python3.11/site-packages/cooltools/api/dotfinder.py<lambda>r   i   s    (E+(E(E(E     c                     d|  dS )Nr   .nnansr   r   s    r   r   r   j   s    )F;)F)F)F r   bin1_idbin2_id)chrom1start1end1chrom2start2end2c                 &    t          | |z            S N)int)	basepairsbinsizes     r   
bp_to_binsr.   x   s    y7"###r   c                 
   | dk    rt          d|  d          | dk    rd\  n+| dk    rd\  n| dk    rd	\  nt          d
|  d          t          j        d d d|             g d}fd|D             }|S )a:  
    Return a recommended set of convolution kernels for dot-calling
    based on the resolution, or binsize, of the input data.

    This function currently recommends the four kernels used in the HiCCUPS method:
    donut, horizontal, vertical, lowerleft. Kernels are recommended for resolutions
    near 5 kb, 10 kb, and 25 kb. Dots are not typically visible at lower resolutions
    (binsize >28kb) and the majority of datasets are too sparse for dot-calling
    at very high resolutions (<4kb). Given this, default kernels are not
    recommended for resolutions outside this range.

    Parameters
    ----------
    binsize : integer
        binsize of the provided cooler

    Returns
    -------
    kernels : {str:ndarray}
        dictionary of convolution kernels as ndarrays, with their
        names as keys.
    i`m  z"Provided cooler has resolution of zc bases, which is too coarse for automated kernel recommendation. Provide custom kernels to proceed.iPF  )      i@  )   r   i  )      zProvided cooler has resolution za bases, which is too fine for automated kernel recommendation. Provide custom kernels to proceed.z-Using recommended donut-based kernels with w=z, p=z for binsize=)donutvertical
horizontallowleftc                 4    i | ]}|t          |          S r   )r
   ).0kpws     r   
<dictcomp>z%recommend_kernels.<locals>.<dictcomp>   s'    <<<!q*Q1%%<<<r   )
ValueErrorlogginginfo)r-   kernel_typeskernelsr<   r=   s      @@r   recommend_kernelsrD   |   s   2 2 2 2 2
 
 	

 
E		11	D11	D11g   
 
 	

 LXXXqXXwXX  
 BAAL =<<<<|<<<GNr   c                    t          | t                    st          d          d |                                 D             }t	          |          t          |          k    rt          d|           t          |          }|dz
  dz  }|dk    s|                                st          d|           ||k    st          d|           d	S )
a   
    TODO implement checks for kernels:
     - matrices are of the same size
     - they should be squared (too restrictive ? maybe pad with 0 as needed)
     - dimensions are odd, to have a center pixel to refer to
     - they can be turned into int 1/0 ones (too restrictive ? allow weighted kernels ?)
     - the central pixel should be zero perhaps (unless weights are allowed 4sure)
     - maybe introduce an upper limit to the size - to avoid crazy long calculations
     - check relative to the binsize maybe ? what's the criteria ?
    zA'kernels' must be a dictionarywith name-keys and ndarrays-values.c                 2    g | ]\  }}t          |          S r   len)r:   knr;   s      r   
<listcomp>z)is_compatible_kernels.<locals>.<listcomp>   s"    999ASVV999r   z,all 'kernels' must have the same size, now: r1   r   r   zASize of the convolution kernels has to be odd and > 3, currently z)Too many NaNs allowed max_nans_tolerated=T)
isinstancedictr?   itemsminmax
is_integer)rC   r-   max_nans_toleratedkernel_widthskernel_widthkernel_half_widths         r   is_compatible_kernelsrU      s    gt$$ 
R
 
 	

 :9999M
=S////WWWXXX}%%L%)Q.Q'8'C'C'E'E^P\^^
 
 	
 --L8JLL
 
 	
 4r   c           	         |                                  }t          |j                  }|                                D ]\  }}t	          j        |j                  }t	          j        |d| d         |          |d<   |                    |                    d          	                                |dg|d| dgd          }d| d	}|
                    d
|i          }|                    |           |j        dd|f         S )a  
    Add columns with the qvalues to a DataFrame of scored pixels

    Parameters
    ----------
    pixels_df : pandas.DataFrame
        a DataFrame with pixel coordinates that must have at least 2 columns
        named 'bin1_id' and 'bin2_id', where first is pixels's row and the
        second is pixel's column index.
    qvalues : dict of DataFrames
        A dictionary with keys being kernel names and values DataFrames
        storing q-values for each observed count values in each lambda-
        bin. Colunms are Intervals defined by 'ledges' boundaries.
        Rows corresponding to a range of observed count values.
    obs_raw_name : str
        Name of the column/field that carry number of counts per pixel,
        i.e. observed raw counts.

    Returns
    -------
    pixels_qvalue_df : pandas.DataFrame
        DataFrame of pixels with additional columns la_exp.{k}.qval,
        storing q-values (adjusted p-values) corresponding to the count
        value of a pixel, its kernel, and a lambda-bin it belongs to.
    r   r   )binslbinsFignore_index) _)left_onright_onsuffixes.qvalvaluecolumnsN)copylistrc   rM   pdIntervalIndexcutmergemeltreset_indexrenameappendloc)		pixels_dfqvaluesobs_raw_namepixels_qvalue_dfcolsr;   qval_dfrX   qval_col_names	            r   annotate_pixels_with_qvaluesrv      s8   6 !~~'' ())Dmmoo # #
7 11$&F0q0001%
 %
 %
! ,11LLeL,,88::!7+"$7a$7$7$78 2 
 
 +!***+22G];S2TTM"""" 4((r   c_labelc_sizec           	         | ||g         j                             t          j                  }| j        }t          d|d          }|                    |           |j        }	|j        }
t          j	        |	dd          \  }}}||         }t          j
        |
|	d          }t          j        d|j         d|                                d	d
|                                d	d           t!          j        ||d|z   d|z   g          }|	                    t          j                  ||<   |                    t          j                  ||<   |S )a  
    Group significant pixels by proximity using Birch clustering. We use
    "n_clusters=None", which implies no AgglomerativeClustering, and thus
    simply reporting "blobs" of pixels of radii <="threshold_cluster" along
    with corresponding blob-centroids as well.

    Parameters
    ----------
    pixels_df : pandas.DataFrame
        a DataFrame with pixel coordinates that must have at least 2 columns
        named 'bin1_id' and 'bin2_id', where first is pixels's row and the
        second is pixel's column index.
    threshold_cluster : int
        clustering radius for Birch clustering derived from ~40kb radius of
        clustering and bin size.
    bin1_id_name : str
        Name of the 1st coordinate (row index) in 'pixel_df', by default
        'bin1_id'. 'start1/end1' could be usefull as well.
    bin2_id_name : str
        Name of the 2nd coordinate (column index) in 'pixel_df', by default
        'bin2_id'. 'start2/end2' could be usefull as well.
    clust_label_name : str
        Name of the cluster of pixels label. "c_label" by default.
    clust_size_name : str
        Name of the cluster of pixels size. "c_size" by default.

    Returns
    -------
    peak_tmp : pandas.DataFrame
        DataFrame with the following columns:
        [c+bin1_id_name, c+bin2_id_name, clust_label_name, clust_size_name]
        row/col (bin1/bin2) are coordinates of centroids,
        label and sizes are unique pixel-cluster
        labels and their corresponding sizes.
    NT)
n_clusters	thresholdcompute_labels)return_inversereturn_countsr   axisz	detected z clusters of z.2fz+/-z sizec)indexrc   )valuesastypenpfloat64r   r   fitlabels_subcluster_centers_uniquetaker@   rA   sizemeanstdrf   	DataFrameint64)ro   threshold_clusterbin1_id_namebin2_id_nameclust_label_nameclust_size_namepixels
pixel_idxsbrcclustered_labelsclustered_centroidsuniq_labelsinverse_idxuniq_countscluster_sizescentroids_per_pixelcentroids_n_labels_dfs                    r   clust_2D_pixelsr     s   Z l34;BB2:NNF J #	  C GGFOOO {1,.IT- - -)Kk  ,M'"57GaPPP LjK$jj;3C3C3E3EjjjkooN_N_jjjj  
 L|#S<%78   /?.E.Ebh.O.O*+-:-A-A"(-K-K/*  r   c              #     K   | |z  r	| |z  dz   }n| |z  }t          j        d|  d|  d||z   d| d| d           |rt          j        d| d           t          |          D ]}t          |          D ]v}t          d	||z  |z
            }t          d	||z  |z
            }t	          | ||dz   z  |z             }	t	          | ||dz   z  |z             }
||z   |	|z   f||z   |
|z   ffV  wd
S )ay  
    Generate a stream of coordinates of tiles that cover a matrix of a given size.
    Matrix has to be square, on-digaonal one: e.g. corresponding to a chromosome
    or a chromosomal arm.

    Parameters
    ----------
    matrix_size : int
        Size of a squared matrix
    offset : int
        Offset coordinates of generated tiles by 'offset'
    tile_size : int
        Requested size of the tiles. Tiles near
        the right and botoom edges could be rectangular
        and smaller then 'tile_size'
    pad : int
        Small padding around each tile to be included in the yielded coordinates.

    Yields
    ------
    Pairs of indices/coordinates of every tile: (start_i, end_i), (start_j, end_j)

    Notes
    -----
    Generated tiles coordinates [start_i,end_i) , [start_i,end_i)
    can be used to fetch heatmap tiles from cooler:
    >>> clr.matrix()[start_i:end_i, start_j:end_j]

    'offset' is useful when a given matrix is part of a
    larger matrix (a given chromosome or arm), and thus
    all coordinated needs to be offset to get absolute
    coordinates.

    Tiles are non-overlapping (pad=0), but tiles near
    the right and bottom edges could be rectangular:

    * * * * * * * * *
    *     *     *   *
    *     *     *   *
    * * * * * * *   *
    *     *         *
    *     *  ...    *
    * * * *         *
    *               *
    * * * * * * * * *
    r1   z matrix Xz to be split into z
 tiles of .z tiles are padded (width=z&) to enable convolution near the edgesr   N)r@   debugrangerO   rN   )matrix_sizeoffset	tile_sizepad	num_tilestitjstart_istart_jend_iend_js              r   tile_square_matrixr     s     ` Y -9,q0		9,	My;yyyy	I@Uyyajyymvyyy    
SSSS	
 	
 	

 I 	Y 	Y	"" 	Y 	YB!Y^c122G!Y^c122GY"q&%9C%?@@EY"q&%9C%?@@EV#UV^4w7GQW6XXXXXX	Y	Y 	Yr   c              #   b  K   |                     d          D ]\  }}}}|                     |||f          \  }	}
|
|	z
  }t          ||	||          D ]Z\  }}|d         |d         z
  }|d         |d         z
  }d|}}t          ||          t	          ||          z
  d|z  k    r|||fV  [dS )a  
    A generator yielding corrdinates of heatmap tiles that are needed to cover
    the requested band_to_cover around diagonal. Each tile is "padded" with
    the pad of size 'pad_size' to allow for convolution near the boundary of
    a tile.

    Parameters
    ----------
    clr : cooler
        Cooler object to use to extract chromosome extents.
    view_df : viewframe
        Viewframe with genomic regions to process, chrom, start, end, name.
    pad_size : int
        Size of padding around each tile. Typically the outer size of the
        kernel.
    tile_size : int
        Size of the heatmap tile.
    band_to_cover : int
        Size of the diagonal band to be covered by the generated tiles.
        Typically correspond to the max_loci_separation for called dots.
    Returns
    -------
    tile_coords : tuple
        Generator of tile coordinates, i.e. tuples of three:
        (region_name, tile_span_i, tile_span_j), where 'tile_span_i/j'
        each is a tuple of bin ids (bin_start, bin_end).
    F)r   )r   r   r   r   r   r1   r   N)
itertuplesextentr   rN   rO   )clrview_dfpad_sizer   band_to_coverchromstartendregion_nameregion_start
region_endregion_sizetile_span_itile_span_jtile_diag_starttile_diag_end
band_startband_ends                     r   generate_tiles_diag_bandr     s
     : +2*<*<5*<*I*I < <&uc;#&::ueS.A#B#B j </(:#	)
 )
 )
 	< 	<$K *!n{1~=O'N[^;M#$mJ Hm,,s:/O/OOH  ";;;;;#	<< <r   c                 L   | \  }}|}t          j        |          }t          |t           j                  r|}	|}
n1t          |t          t
          f          r|\  }	}
nt          d          t          j        |	|
          }t          j        ||          }t           j	        |t          j
        |||z
  dz
            <   t           j	        |t          j
        |||z
  dz
            <   t          j        ||          }t          j        t          j        |          t          j        |                    }d||<   d||<   d||<   t          j        |j                  \  }}t!          j        |                                |z   |                                |z   |                                d          }t          j        dd          5  |                                D ]*\  }}t+          ||ddd	
          }t+          ||ddd	
          }t+          |                    t           j                  |d	k                        t           j                  ddd	
          }t          j        ||          }t          j        ||          }t1          j        d| d| d| d           |                                |d| d<   |                                |d| d<   t          j        |                                          |d| <   ,	 ddd           n# 1 swxY w Y   |S )a  
    Get locally adjusted expected for a collection of local-filters (kernels).

    Such locally adjusted expected, 'Ek' for a given kernel,
    can serve as a baseline for deciding whether a given
    pixel is enriched enough to call it a feature (dot-loop,
    flare, etc.) in a downstream analysis.

    For every pixel of interest [i,j], locally adjusted
    expected is a product of a global expected in that
    pixel E_bal[i,j] and an enrichment of local environ-
    ment of the pixel, described with a given kernel:

    ::

                                  KERNEL[i,j](O_bal)
        Ek_bal[i,j] = E_bal[i,j]* ------------------
                                  KERNEL[i,j](E_bal)

    where KERNEL[i,j](X) is a result of convolution
    between the kernel and a slice of matrix X centered
    around (i,j). See link below for details:
    https://en.wikipedia.org/wiki/Kernel_(image_processing)

    Returned values for observed and all expecteds
    are rescaled back to raw-counts, for the sake of
    downstream statistical analysis, which is using
    Poisson test to decide is a given pixel is enriched.
    (comparison between balanced values using Poisson-
    test is intractable):

    ::

                                  KERNEL[i,j](O_bal)
        Ek_raw[i,j] = E_raw[i,j]* ------------------ ,
                                  KERNEL[i,j](E_bal)

    where E_raw[i,j] is:

    ::

              1               1
        -------------- * -------------- * E_bal[i,j]
        bal_weights[i]   bal_weights[j]


    Parameters
    ----------
    origin_ij : (int,int) tuple
        tuple of interegers that specify the
        location of an observed matrix slice.
        Measured in bins, not in nucleotides.
    observed : numpy.ndarray
        square symmetrical dense-matrix
        that contains balanced observed O_bal
    expected : numpy.ndarray
        square symmetrical dense-matrix
        that contains expected, calculated
        based on balanced observed: E_bal.
    bal_weights : numpy.ndarray or (numpy.ndarray, numpy.ndarray)
        1D vector used to turn raw observed
        into balanced observed for a slice of
        a matrix with the origin_ij on the diagonal;
        and a tuple/list of a couple of 1D arrays
        in case it is a slice with an arbitrary
        origin_ij.
    kernels : dict of (str, numpy.ndarray)
        dictionary of kernels/masks to perform
        convolution of the heatmap. Kernels
        describe the local environment, and
        used to estimate baseline for finding
        enriched/prominent peaks.
        Peak must be enriched with respect to
        all local environments (all kernels),
        to be considered significant.
        Dictionay keys must contain names for
        each kernel.
        Note, scipy.ndimage.convolve first flips kernel
        and only then applies it to matrix.

    Returns
    -------
    peaks_df : pandas.DataFrame
        DataFrame with the results of locally adjusted calculations
        for every kernel for a given slice of input matrix.

    Notes
    -----

    Reported columns:
        bin1_id - bin1_id index (row), adjusted to tile_start_i
        bin2_id - bin bin2_id index, adjusted to tile_start_j
        la_exp - locally adjusted expected (for each kernel)
        la_nan - number of NaNs around (each kernel's footprint)
        exp.raw - global expected, rescaled to raw-counts
        obs.raw(counts) - observed values in raw-counts.

    Depending on the intial tiling of the interaction matrix,
    concatened `peaks_df` may require "deduplication", as some pixels
    can be evaluated in several tiles (e.g. near the tile edges).
    Default tilitng in the `dots` functions, should avoid this problem.

    z'bal_weights' must be an numpy.ndarrayfor slices of a matrix with diagonal-origin ora tuple/list of a couple of numpy.ndarray-sfor a slice of matrix with an arbitrary origin.r1   )r;   g        r!   r"   r   r   )divideinvalidconstantr   )modecvaloriginzConvolution with kernel z is done for tile @  r   r   r   r    safe_division.N)r   rd   rK   ndarraytuplere   r?   outermultiplynantril_indices_fromr   
logical_orisnanindicesshaperf   r   ravelerrstaterM   r   r   r   r@   r   isfinite)	origin_ijobservedexpectedbal_weightsrC   iojoO_rawE_balv_bal_iv_bal_jbal_weights_ijO_balE_rawN_balijpeaks_dfr   kernelKOKENNlocal_adjustment_factorEk_raws                            r   $get_adjusted_expected_tile_some_nansr      sz   V FBEGHE+rz** 
	K%	/	/ 
&>
 
 	
 Xgw//N K~..E ;=&E"
ubA
6
6
67:<&E"
ubA
6
6
67 Ie^,,E
 M"(5//28E??;;E E%LE%LE%L :ek""DAq|		B17799r>\a\g\g\i\ijjkkH	Hh	7	7	7 -d -d#*==?? ,	d ,	dK %js1MMMB %js1MMMB
 RX&&1$$RX..  B ')iB&7&7#[(?@@FMV;VVBVVQSVVV   7=llnnH2{222368hhjjH2{222379{CZC`C`CbCb7c7cH3k3344Y,	d-d -d -d -d -d -d -d -d -d -d -d -d -d -d -d` Os   ELL Lc                    | \  }}	}
|	d         |
d         f}t          |j        ||f         |                                                   }|                    d          t	          |	 t	          |
 f         }|t	          |	 t	          |
 f         }|                                t	          |	          |                                         }|                                t	          |
          |                                         }t          |||||f|          }|d         |d         k     }|d         |d         |z
  k    }t          j        |d |D                      |k     d	          }|d
 |D                                          d	          }|||z  |z  |z           	                    d          }|g dd |D             z            
                    d |D                       S )ag  
    The main working function that given a tile of a heatmap, applies kernels to
    perform convolution to calculate locally-adjusted expected and then
    calculates a p-value for every meaningfull pixel against these
    locally-adjusted expected (la_exp) values.

    Parameters
    ----------
    tile_cij : tuple
        Tuple of 3: region name, tile span row-wise, tile span column-wise:
        (region, tile_span_i, tile_span_j), where tile_span_i = (start_i, end_i), and
        tile_span_j = (start_j, end_j).
    clr : cooler
        Cooler object to use to extract Hi-C heatmap data.
    expected_indexed : pandas.DataFrame
        DataFrame with cis-expected, indexed with 'region1', 'region2', 'dist'.
    expected_value_col : str
        Name of a value column in expected DataFrame
    clr_weight_name : str
        Name of a value column with balancing weights in a cooler.bins()
        DataFrame. Typically 'weight'.
    kernels : dict
        A dictionary with keys being kernels names and values being ndarrays
        representing those kernels.
    max_nans_tolerated : int
        Number of NaNs tolerated in a footprint of every kernel.
    band_to_cover : int
        Results would be stored only for pixels connecting loci closer than
        'band_to_cover'.

    Returns
    -------
    res_df : pandas.DataFrame
        results: annotated pixels with calculated locally adjusted expected
        for every kernels, observed, precalculated pvalues, number of NaNs in
        footprint of every kernels, all of that in a form of an annotated
        pixels DataFrame for eligible pixels of a given tile.

    r   F)balance)r   r   r   r   rC   r!   r"   c                     g | ]}d | d	S )r   r    r   r:   r;   s     r   rJ   zscore_tile.<locals>.<listcomp>0  s$    555#!###555r   r1   r   c                     g | ]}d | S )r   r   r   s     r   rJ   zscore_tile.<locals>.<listcomp>4  s!     G G G!!5!!5!5 G G Gr   rc   Tdropr   c                     g | ]}d | d	S r   r   r   r   s     r   rJ   zscore_tile.<locals>.<listcomp>>  s$    *P*P*P1+>Q+>+>+>*P*P*Pr   c                     i | ]	}d | dd
S )r   r   r   r   r   s     r   r>   zscore_tile.<locals>.<dictcomp>?  s&    DDDq'a'''DDDr   )dtype)r	   rn   to_numpymatrixslicerW   r   r   allrk   r   )tile_cijr   expected_indexedexpected_value_colclr_weight_namerC   rQ   r   r   r   r   tile_start_ijlazy_expr   r   bal_weight_ibal_weight_jresult
upper_bandis_inside_banddoes_comply_nansfinite_values_onlyres_dfs                          r   
score_tiler    s+   d -5)Kk ^[^4M
 [+567IJSSUU H
 zz%z(()<e[>Q)QRH{+UK-@@AH88::e[12?CLLNNL88::e[12?CLLNNL 2!<0  F 	"VI%66J I&&*;m*KLN v55W55569KKRS  
   G Gw G G GHLLR[L\\ J/2BBEWWXdd e  F '''*P*P*P*P*PPfDDGDDDfEEFr   c                 H   i }|D ]}t          j        | d| d         |          }|                     ||gdd          d| d                                                                                             d                              t          j                  ||<   |S )a  
    An attempt to implement HiCCUPS-like lambda-binning statistical procedure.
    This function aims at building up a histogram of locally adjusted
    expected scores for groups of characterized pixels.

    Such histograms are subsequently used to compute FDR thresholds
    for different "classes" of hypothesis (classified by their
    locally-adjusted expected (la_exp)).

    Parameters
    ----------
    scored_df : pd.DataFrame
        A table with the scoring information for a group of pixels.
    kernels : dict
        A dictionary with keys being kernels names and values being ndarrays
        representing those kernels.
    ledges : ndarray
        An ndarray with bin lambda-edges for groupping locally adjusted
        expecteds, i.e., classifying statistical hypothesis into lambda-bins.
        Left-most bin (-inf, 1], and right-most one (value,+inf].
    obs_raw_name : str
        Name of the column/field that carry number of counts per pixel,
        i.e. observed raw counts.

    Returns
    -------
    hists : dict of pandas.DataFrame
        A dictionary of pandas.DataFrame with lambda/observed 2D histogram for
        every kernel-type.


    Notes
    -----
    returning histograms corresponding to the chunks of scored pixels.
    r   r   F)dropnar   r   )	rf   rh   groupbyr   unstackfillnar   r   r   )	scored_dfrC   ledgesrq   histsr;   rX   s          r   histogram_scored_pixelsr  B  s    N E 
 
 y!41!4!4!45v>> |U3EERR#!### UWWWYYVAYYVBH 	a Lr   c                    i }i }|                                  D ]\  }}|j        ddd                             d          j        ddd         }|j        dddf         }t          j                                        |          }|j        D ]8}	|j                                        }
t          j
        |
|	j                  ||	<   9||z  }|j                                        dz   }||z  |z
                                  }|                    |dk                                                                   |                              t$          j                  ||<   ||z                                  ||<   ||                             ||         dk    d          ||<   t          j        ||         j                  ||         _        ||fS )aJ  
    given a 'gw_hist' histogram of observed counts
    for each lambda-bin and for each kernel-type, and
    also given a FDR, calculate q-values for each observed
    count value in each lambda-bin for each kernel-type.

    Parameters
    ----------
    gw_hist_kernels : dict
        dictionary {kernel_name : 2D_hist}, where '2D_hist' is a pd.DataFrame
    fdr : float
        False Discovery Rate level

    Returns
    -------
    threshold_df : dict
      each threshold_df[k] is a Series indexed by la_exp intervals
      (IntervalIndex) and it is all we need to extract "good" pixels from
      each chunk ...
    qvalues : dict
      A dictionary with keys being kernel names and values pandas.DataFrames
      storing q-values: each column corresponds to a lambda-bin,
      while rows correspond to observed pixels values.


    Nr   r   r1   g      ?)rM   iloccumsumrf   r   reindex_likerc   r   r  r   sfrightrO   cummaxmaskidxminr  r   r   r   cumminrg   )gw_histfdrrp   threshold_dfr;   _histrcs_histnormunit_Poissonlbin_occurances_high_valuefdr_diffs                r   determine_thresholdsr4    s   f GLMMOO %H %H5:ddd#***227"=}QT" |~~228<<$ 	E 	ED".1133K!(K!D!DLl* n((**Q.8^|3;;== MM(Q,''VXXVK  VBH	 	Q #X-5577
 QZ__WQZ#%5s;;
 " 0a1F G GQ  r   c                 X   g }|                                 D ]x\  }}| d| d         }t          j        ||d          }|j        |         }	|                    | |                                         |	                                k               y| t          j        |d                   S )a  
    Implementation of HiCCUPS-like lambda-binning statistical procedure.
    Use FDR thresholds for different "classes" of hypothesis
    (classified by their locally-adjusted expected (la_exp) scores),
    in order to extract "enriched" pixels.

    Parameters
    ----------
    scored_df : pd.DataFrame
        A table with the scoring information for a group of pixels.
    thresholds : dict
        A dictionary {kernel_name : lambda_thresholds}, where 'lambda_thresholds'
        are pd.Series with FDR thresholds indexed by lambda-bin intervals
    ledges : ndarray
        An ndarray with bin lambda-edges for groupping locally adjusted
        expecteds, i.e., classifying statistical hypothesis into lambda-bins.
        Left-most bin (-inf, 1], and right-most one (value,+inf].
    obs_raw_name : str
        Name of the column/field with number of counts per pixel,
        i.e. observed raw counts.

    Returns
    -------
    scored_df_slice : pandas.DataFrame
        Filtered DataFrame of pixels that satisfy thresholds.

    r   r   F)labelsr   r   )rM   rf   rh   r   rm   r  r   r  )
r  
thresholdsr  rq   compliant_pixel_masksr   r{   lambda_of_pixelslbin_idxthreshold_of_pixelss
             r   extract_scored_pixelsr<    s    8 ","2"2"4"4 	
 	
Y$%B{%B%B%BC6*F5AAA'nX6$$l#,,..2E2N2N2P2PP	
 	
 	
 	
 RV1:::;;r   regionc           
      2   dddd|h                     |           st          d          |                                 } || j        vrMt	          j        d           t          j        | d         | d         k    | d         t          j                  | |<   g }| 	                    |d          }|D ]D\  }}t	          j
        d	|            t          ||dd
          }|                    |           Et	          j
        d           |sMt	          j        d           t          j        g t          | j                  |dz   |dz   ddddgz             }	|	S t          j        |d          }
t          j        | |
ddd          }||                             t&                    ||dz   <   ||                             t&                    ||dz   <   |	                    |dz   |dz   dgd          }|j        ||                                                  }|S )a  
    Group together adjacent significant pixels into clusters after
    the lambda-binning multiple hypothesis testing by iterating over
    assigned regions and calling `clust_2D_pixels`.

    Parameters
    ----------
    scored_df : pandas.DataFrame
        DataFrame with enriched pixels that are ready to be
        clustered and are annotated with their genomic  coordinates.
    dots_clustering_radius : int
        Birch-clustering threshold.
    assigned_regions_name : str | None
        Name of the column in scored_df to use for grouping pixels
        before clustering. When None, full chromosome clustering is done.
    obs_raw_name : str
        name of the column with raw observed pixel counts
    Returns
    -------
    centroids : pandas.DataFrame
        Pixels from 'scored_df' annotated with clustering information.

    Notes
    -----
    'dots_clustering_radius' in Birch clustering algorithm corresponds to a
    double the clustering radius in the "greedy"-clustering used in HiCCUPS

    r#   r&   r$   r'   z7Scored pixels provided for clustering are not annotatedzMNo regions assigned to the scored pixels before clustering, using chromosomesT)r   z&clustering enriched pixels in region: )r   r   r   zClustering is completez7No clusters found for any regions! Output will be empty12rw   rx   cstart1cstart2rb   FrY   left)how
left_indexright_index)issubsetr?   rd   rc   r@   warningr   wherer   r  rA   r   rm   rf   r   re   concatri   r   strrn   idxmax)r  dots_clustering_radiusassigned_regions_namerq   pixel_clust_listscored_pixels_by_regionr=  _dfpixel_clustempty_outputpixel_clust_dfdfchrom_clust_group	centroidss                 r   clustering_steprX  
  s   F h(LAJJ9UU TRSSS  I!Y%666\	
 	
 	
 ,.8h9X#66	(8KRV,
 ,
	'(
 '//0EPT/UU. 	- 	-FfFFGGG%4!!	
 
 
 	,,,,L)***  
QRRR|*++%+%+
 
 
 5
 
 
 
>v$D
 
 
B '))>&?&F&Fs&K&KBs"#&()>&?&F&Fs&K&KBs"#

		$&;c&A9M #   ,'..00I r   g      ?g      ?g       @g{Gz?c                    d| vrt          d          dddd|h                    |           st          d          h d}|                    |           st          d	          | |         || d
         z  k    | |         || d         z  k    z  | |         || d         z  k    z  | |         || d         z  k    z  | |         || d
         z  k    | |         || d         z  k    z  z  | d         dk    | d         | d         z   | d         z   | d         z   |k    z  z  }t          j        d|                                 dt          |            d           | |                             d          S )ag  
    Centroids of enriched pixels can be filtered to further minimize
    the amount of false-positive dot-calls.

    First, centroids are filtered on enrichment relative to the
    locally-adjusted expected for the "donut", "lowleft", "vertical",
    and "horizontal" kernels. Additionally, singleton pixels
    (i.e. pixels that do not belong to a cluster) are filtered based on
    a combined q-values for all kernels. This empirical filtering approach
    was developed in Rao et al 2014 and results in a conservative dot-calls
    with the low rate of false-positive calls.

    Parameters
    ----------
    centroids : pd.DataFrame
        DataFrame that stores enriched and clustered pixels.
    obs_raw_name : str
        name of the column with raw observed pixel counts
    enrichment_factor_vh : float
        minimal enrichment factor for pixels relative to
        both "vertical" and "horizontal" kernel.
    enrichment_factor_d_and_ll : float
        minimal enrichment factor for pixels relative to
        both "donut" and "lowleft" kernels.
    enrichment_factor_d_or_ll : float
        minimal enrichment factor for pixels relative to
        either "donut" or" "lowleft" kenels.
    FDR_orphan_threshold : float
        minimal combined q-value for singleton pixels.

    Returns
    -------
    filtered_centroids : pd.DataFrame
        filtered dot-calls
    rx   z7input dataframe of pixels does not seem to be clusteredr#   r&   r$   r'   zKinput dataframe of clustered pixels provided for filtering is not annotated>   la_exp.donut.qvalla_exp.donut.valuela_exp.lowleft.qvalla_exp.lowleft.valuela_exp.vertical.qvalla_exp.vertical.valuela_exp.horizontal.qvalla_exp.horizontal.valuezNclustered pixels provided for filtering were not scored with 4 hiccups kernelsr]  r[  r_  ra  r1   r\  rZ  r^  r`  z	filtered z out of z2 centroids to reduce the number of false-positivesTr   )r?   rG  r@   rA   sumrH   rk   )rW  rq   enrichment_factor_vhenrichment_factor_d_and_llenrichment_factor_d_or_llFDR_orphan_threshold_hiccups_kernel_cols_setenrichment_fdr_complys           r   cluster_filtering_hiccupsri  s  s!   X y  STTT h(LAJJ9UU 
Y
 
 	


  
  
  $,,Y77 
\
 
 	
 l#(95K+LLM l#(95I+JJK	
 l#"Y/F%GGH	
 l#"Y/H%IIJ	
$ ,'+i8N.OOP ,'+i8L.MMN#	
6 x 1$ 34 345 678   89:
 ((	7%	
 R L{)--//{{Y{{{  
 *+77T7BBBr   c           
          t          j        dt          |           d           t          t          | |||||          t          t
          |          fd}|	dk    r@t          t          t          j	        t          |          |	z                                }ni } |
||fi |}fd}t          ||          }D ]}||         j        d	d	d
f         }|                                dk    rt          d| d|j         d          ||         j        ||                                         dk             d
         }||         j        d	d	d	|f         ||<   ||         j        j        st          d| d          |S )z
    This implements the 1st step of the lambda-binning scoring procedure - histogramming.

    In short, this pipes a scoring operation together with histogramming into a
    single pipeline of per-chunk operations/transforms.
    convolving z* tiles to build histograms for lambda-binsr   r  r  r	  rC   rQ   r   )rC   r  c                 ,      |                     S r*   r   )tileto_histto_scores    r   r   z0scoring_and_histogramming_step.<locals>.<lambda>  s    wwxx~~.. r   r1   	chunksizec                     i }D ]X}| |                              ||         d                              d                              t          j                  ||<   Y|S )Nr   )
fill_value)addr  r   r   r   )hxhyhxyr;   rC   s       r   
_sum_histsz2scoring_and_histogramming_step.<locals>._sum_hists$  s_     	O 	OAUYYr!uY33::1==DDRXNNCFF
r   Nr  r   zThere are la_exp.z
.value in z, please check the histogramzHistogram for z-kernel is not sorted)r@   rA   rH   r   r  r  rL   r+   r   ceilr   r   rb  r?   namerc   rn   r   is_monotonic_increasing)r   r  r  r	  tilesrC   r  rQ   loci_separation_binsnprocmap_functorjob
map_kwargshistogram_chunksry  
final_histr;   last_lambda_binlast_non_empty_lbinro  rp  s        `             @@r   scoring_and_histogramming_stepr    s   ( LUs5zzUUUVVV )-'-*	 	 	H -wvNNNG /
.
.
.
.C qyyCE

U0B(C(C$D$DEEE


 #{3<<<<     
$455J  H H$Q-,QQQU3  A%%cAcc1Eccc   )m3JqM4E4E4G4G!4KLRP"1)!!!-A.A-A*AB
1!}": 	HFaFFFGGG	H r   c           
         t          j        dt          |           d           t          t          | ||||||	          t          t
          ||          fd}|
dk    rht          j        d|
 dt          |           d	           t          t          t          j	        t          |          |
z                      
          }nt          j        d           i } |||fi |}t          j        |d          }|                                                                rt          d          |                    ||g                              d          S )z
    This implements the 2nd step of the lambda-binning scoring procedure,
    extracting pixels that are FDR compliant.

    In short, this combines scoring with with extraction into a
    single pipeline of per-chunk operations/transforms.

    rk  z! tiles to extract enriched pixelsrl  )r7  r  c                 ,      |                     S r*   r   )rn  
to_extractrp  s    r   r   z-scoring_and_extraction_step.<locals>.<lambda>o  s    zz((4..11 r   r1   zcreating a Pool of z workers to tackle z tilesrq  z"fallback to serial implementation.TrY   zRSome pixels were scored more than one time, matrix tiling procedure is not correct)byr   )r@   rA   rH   r   r  r<  rL   r+   r   rz  rf   rJ  
duplicatedanyr?   sort_valuesrk   )r   r  r  r	  r}  rC   r  r7  rQ   r~  r  r   r   r  r  r  filtered_pix_chunkssignificant_pixelsr  rp  s                     @@r   scoring_and_extraction_stepr  @  s   2 LLs5zzLLLMMM )-'-*	 	 	H   J 2
1
1
1
1C qyyW5WWSZZWWWXXXCE

U0B(C(C$D$DEEE

9:::
 &+c5??J??#6TJJJ $$&&**,, 
a
 
 	
 ))lL-I)JJVV W   r   balanced.avgweight逖 r1   (   皙? N  @KL c                    |t          |           }n7	 t          || dd          }n"# t          $ r}t          d          |d}~ww xY w|r:	 t	          | |d          }n5# t          $ r}t          d| d          |d}~ww xY wt          d          	 t          |d	|| |gd
          }n"# t          $ r}t          d          |d}~ww xY w|                    g d                                          }| j        }t          ||          }t          ||          }|r&t          |||          rt          j        d           nt          |          }t          d |                                D                       }t!          |dz
  dz            }d|cxk    rdk    sn t          d|           d}t#          j        t"          j         gt#          j        d|dz
  ||t"          j                  t"          j        gf          }t-          t/          | ||||                    }t1          j                    }t5          | |||||||||
  
        }t1          j                    |z
  }t7          j        d|dd           t;          ||	          \  }}t7          j        d           t1          j                    }t=          | ||||||||||dd          }t1          j                    |z
  }t7          j        d|dd           t7          j        d t?          |           d!           t7          j        d"           tA          ||          }tC          j"        || #                                g d#         d$          }|
s=g } | tH          z  } | tJ          gz  } | d% |D             z  } | d& |D             z  } ||          S tM          ||          }tO          ||
          (                    d'          }!g } | tH          z  } | d(d)d*d+tJ          gz  } | d, |D             z  } | d- |D             z  } ||rtS          |!          }"n|s|!}"|"S ).a  
    Call dots on a cooler {clr}, using {expected} defined in regions specified
    in {view_df}.

    All convolution kernels specified in {kernels} will be all applied to the {clr},
    and statistical testing will be performed separately for each kernel. A convolutional
    kernel is a small squared matrix (e.g. 7x7) of zeros and ones
    that defines a "mask" to extract local expected around each pixel. Since the
    enrichment is calculated relative to the central pixel, kernel width should
    be an odd number >=3.

    Parameters
    ----------
    clr : cooler.Cooler
        A cooler with balanced Hi-C data.
    expected : DataFrame in expected format
        Diagonal summary statistics for each chromosome, and name of the column
        with the values of expected to use.
    expected_value_col : str
        Name of the column in expected that holds the values of expected
    clr_weight_name : str
        Name of the column in the clr.bins to use as balancing weights.
        Using raw unbalanced data is not supported for dot-calling.
    view_df : viewframe
        Viewframe with genomic regions, at the moment the view has to match the
        view used for generating expected. If None, generate from the cooler.
    kernels : { str:np.ndarray } | None
        A dictionary of convolution kernels to be used for calculating locally adjusted
        expected. If None the default kernels from HiCCUPS are going to be recommended
        based on the resolution of the cooler.
    max_loci_separation : int
        Miaximum loci separation for dot-calling, i.e., do not call dots for
        loci that are further than max_loci_separation basepair apart. default 10Mb.
    max_nans_tolerated : int
        Maximum number of NaNs tolerated in a footprint of every used kernel
        Adjust with caution, as large max_nans_tolerated, might lead to artifacts in
        pixels scoring.
    n_lambda_bins : int
        Number of log-spaced bins, where FDR-testing will be performed independently.
        TODO: generate lambda-bins on the fly based on the dynamic range of the data (i.e. maximum pixel count)
    lambda_bin_fdr : float
        False discovery rate (FDR) for multiple hypothesis testing BH-FDR procedure, applied per lambda bin.
    clustering_radius : None | int
        Cluster enriched pixels with a given radius. "Brightest" pixels in each group
        will be reported as the final dot-calls. If None, no clustering is performed.
    cluster_filtering : bool
        whether to apply additional filtering to centroids after clustering, using cluster_filtering_hiccups()
    tile_size : int
        Tile size for the Hi-C heatmap tiling. Typically on order of several mega-bases, and <= max_loci_separation.
        Controls tradeoff between memory consumption and speed of execution.
    nproc : int
        Number of processes to use for multiprocessing.

    Returns
    -------
    dots : pandas.DataFrame
        BEDPE-style dataFrame with genomic coordinates of called dots and additional annotations.

    Notes
    -----
    'clustering_radius' in Birch clustering algorithm corresponds to a
    double the clustering radius in the "greedy"-clustering used in HiCCUPS
    (to be tested).

    TODO describe sequence of processing steps

    NT)check_sortingraise_errorsz0view_df is not a valid viewframe or incompatible)r  z#provided cooler is not balanced or z is missingz*calling dots on raw data is not supported.cis)verify_coolerexpected_value_colsr  z#provided expected is not compatible)region1region2distzVCompatibility checks for 'kernels' are not fully implemented yet, use at your own riskc              3   4   K   | ]}t          |          V  d S r*   rG   r   s     r   	<genexpr>zdots.<locals>.<genexpr>  s(      88!s1vv888888r   r1   r   r  2   zIncompatible n_lambda_bins=gr(?r   )numbaser  )r  r	  r}  rC   r  rQ   r~  r  zDone building histograms in z.3fz sec ...z.Determined thresholds for every lambda-bin ...r!   r"   )r  r	  r}  rC   r  r7  rQ   r~  r  r   r   z#Done extracting enriched pixels in zBegin post-processing of z filtered pixelsz(preparing to extract needed q-values ...)r   r   r   )replacec                     g | ]}d | d	S r   r   r   s     r   rJ   zdots.<locals>.<listcomp>m  s$    ===+!+++===r   c                     g | ]}d | d	S r   r`   r   r   s     r   rJ   zdots.<locals>.<listcomp>n  s$    <<<q*!***<<<r   r   rA  rB  rw   rx   c                     g | ]}d | d	S r   r   r   s     r   rJ   zdots.<locals>.<listcomp>  s$    999A'a'''999r   c                     g | ]}d | d	S r  r   r   s     r   rJ   zdots.<locals>.<listcomp>  s$    8881&a&&&888r   )*r   r   	Exceptionr?   r   r   	set_index
sort_indexr-   r.   rU   warningswarnrD   rO   r   r+   r   concatenateinflogspacer   re   r   timeperf_counterr  r@   rA   r4  r  rH   rv   coolerannotaterW   bedpe_required_colsobserved_count_namer   rX  rk   ri  )#r   r   r  r	  r   rC   max_loci_separationrQ   n_lambda_binslambda_bin_fdrclustering_radiuscluster_filteringr   r  r\   er-   r~  tile_size_binsrS   rT   BASEr  r}  
time_startr)  elapsed_timer+  rp   filtered_pixelsfiltered_pixels_qvalsfiltered_pixels_annotatedoutput_colsrW  postprocessed_callss#                                      r   dotsr    s   j "3''	X'"!	  AA  	X 	X 	XOPPVWW	X  	G	"3dKKKAA 	 	 	RoRRR 	
 EFFFG"! 	
 	
 	
  G G G>??QFG!!"@"@"@AALLNNH kG%&97CC	733N  -(';MNN -d	
 	
 	
 	

 $G,,88w~~'7'788888L\A-233 $$$$"$$$$F}FFGGGD^fWIK!!j   VH
	
 F  +^=Q	
 	
 E "$$J,-'-1  G $&&3LLJJJJJKKK 1.IIL'LABBB "$$J1-'-1  O $&&3LLQ|QQQQRRR LSS-A-ASSSTTTL;<<< 9'RR &sxxzz*C*C*CDd! ! !  
6 **
 	
 	==W====<<G<<<<(55 !//H' R R!  ktk  K&&K K 999999K888888K 	!&7!7	BB ('sD   ( 
AAAA   
B*A>>BB- -
C7CC)r   r!   r"   rw   rx   )r   )r  r  NNr  r1   r  r  r  Nr  r1   );__doc__	functoolsr   r   r@   r  r  scipy.ndimager   scipy.statsr   numpyr   pandasrf   sklearn.clusterr   r  lib.numutilsr	   r
   
lib.commonr   r   
lib.checksr   r   r   r   simplefiltererrorsPerformanceWarningbasicConfigINFOr  expected_count_nameadjusted_exp_namenans_inkernel_namer   r   r  r.   rD   rU   rv   r   r   r   r   r  r  r4  r<  rX  ri  mapr  r  r  r   r   r   <module>r     s  J JX & % % % % % % %    " " " " " "               ! ! ! ! ! !  3 3 3 3 3 3 3 3 9 9 9 9 9 9 9 9         
 ( ' ' ' ' '  X	0L M M M M  ', ' ' ' '   EE FF    $ $ $5 5 5p* * *Z CV 0) 0) 0) 0)j c! c! c! c!VGY GY GY GYT1< 1< 1<nT T TteF eF eFR .A; ; ; ;|\! \! \!~ GZ (< (< (< (<\ #$	f f f fV %#!wC wC wC wC|  L L L L\  D D D DV &"
D D D D D Dr   